diff --git a/lib/galaxy/config/sample/object_store_conf.sample.yml b/lib/galaxy/config/sample/object_store_conf.sample.yml index b1b2cb34afec..0c96a549f22d 100644 --- a/lib/galaxy/config/sample/object_store_conf.sample.yml +++ b/lib/galaxy/config/sample/object_store_conf.sample.yml @@ -135,10 +135,64 @@ backends: store_by: uuid files_dir: /old-fs/galaxy/files + +# There are now four ways to access S3 related services. Two are +# suitable just for AWS services (aws_s3 & cloud), one is +# more suited for non-AWS S3 compatible services (generic_s3), +# and finally boto3 gracefully handles either scenario. +# +# boto3 is built on the newest and most widely used Python client +# outside of Galaxy. It has advanced transfer options and is likely +# the client you should use for new setup. generic_s3 and aws_s3 +# have existed in Galaxy for longer and could perhaps be considered +# more battle tested. Both boto3 and generic_s3 have been tested +# with multiple non-AWS APIs including minio and GCP. The cloud +# implementation is based on CloudBridge and is still supported +# and has been recently tested - the downside is mostly the advanced +# multi-threaded processing options of boto3 are not available +# and it has not been battle tested like aws_s3. + +# +# Sample AWS S3 Object Store configuration (newest boto3 client) +# +type: boto3 +auth: + access_key: ... + secret_key: ... +bucket: + name: unique_bucket_name_all_lowercase +connection: # not strictly needed but more of the API works with this. + region: us-east-1 +transfer: + multipart_threshold: 10000000 + download_max_concurrency: 5 + upload_max_concurrency: 10 + # any of these options: + # multipart_threshold, max_concurrency, multipart_chunksize, + # num_download_attempts, max_io_queue, io_chunksize, use_threads, + # and max_bandwidth + # can be set. By default they will apply to uploads and downloads + # but they can be prefixed with upload_ or download_ as shown above + # to apply to just one scenario. More information about these parameters + # can be found at: + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig + +cache: + path: database/object_store_cache_s3 + size: 1000 + cache_updated_data: true +extra_dirs: + - type: job_work + path: database/job_working_directory_s3 + + + # -# Sample AWS S3 Object Store configuration +# Sample AWS S3 Object Store configuration (legacy boto implementation) # - +# This implementation will use axel automatically for file transfers if it is on +# Galaxy's path. Otherwise, it will use various python-based strategies for multi-part +# upload of large uploads but all downloads will be single threaded. type: aws_s3 auth: access_key: ... @@ -147,6 +201,8 @@ bucket: name: unique_bucket_name_all_lowercase use_reduced_redundancy: false max_chunk_size: 250 +connection: # not strictly needed but more of the API works with this. + region: us-east-1 cache: path: database/object_store_cache_s3 size: 1000 @@ -182,7 +238,32 @@ extra_dirs: path: database/job_working_directory_irods # -# Sample non-AWS S3 Object Store (e.g. swift) configuration +# Sample non-AWS S3 Object Store (e.g. swift) configuration (boto3) +# + +type: boto3 +auth: + access_key: ... + secret_key: ... +bucket: + name: unique_bucket_name_all_lowercase +connection: + endpoint_url: https://swift.example.org:6000/ + # region: some services may make use of region is specified. 
+ # older style host, port, secure, and conn_path available to generic_s3 work + # here also - Galaxy will just infer a endpoint_url from those. +cache: + path: database/object_store_cache_swift + size: 1000 + cache_updated_data: true +# transfer: # see transfer options for boto3 above in AWS configuration. +extra_dirs: + - type: job_work + path: database/job_working_directory_swift + + +# +# Sample non-AWS S3 Object Store (e.g. swift) configuration (legacy boto client) # type: generic_s3 diff --git a/lib/galaxy/dependencies/__init__.py b/lib/galaxy/dependencies/__init__.py index 0bb785aa136a..44322353329a 100644 --- a/lib/galaxy/dependencies/__init__.py +++ b/lib/galaxy/dependencies/__init__.py @@ -234,6 +234,9 @@ def check_python_pam(self): def check_azure_storage(self): return "azure_blob" in self.object_stores + def check_boto3(self): + return "boto3" in self.object_stores + def check_kamaki(self): return "pithos" in self.object_stores diff --git a/lib/galaxy/dependencies/dev-requirements.txt b/lib/galaxy/dependencies/dev-requirements.txt index 57a4854417dc..9ac31e86937c 100644 --- a/lib/galaxy/dependencies/dev-requirements.txt +++ b/lib/galaxy/dependencies/dev-requirements.txt @@ -10,6 +10,7 @@ babel==2.14.0 ; python_version >= "3.8" and python_version < "3.13" backports-tarfile==1.1.1 ; python_version >= "3.8" and python_version < "3.12" backports-zoneinfo==0.2.1 ; python_version >= "3.8" and python_version < "3.9" black==24.4.2 ; python_version >= "3.8" and python_version < "3.13" +boto3==1.34.69 ; python_version >= "3.8" and python_version < "3.13" build==1.2.1 ; python_version >= "3.8" and python_version < "3.13" cachecontrol[filecache]==0.14.0 ; python_version >= "3.8" and python_version < "3.13" certifi==2024.2.2 ; python_version >= "3.8" and python_version < "3.13" diff --git a/lib/galaxy/objectstore/__init__.py b/lib/galaxy/objectstore/__init__.py index 62d49420e5e6..412228167971 100644 --- a/lib/galaxy/objectstore/__init__.py +++ b/lib/galaxy/objectstore/__init__.py @@ -55,7 +55,10 @@ from .caching import CacheTarget if TYPE_CHECKING: - from galaxy.model import DatasetInstance + from galaxy.model import ( + Dataset, + DatasetInstance, + ) NO_SESSION_ERROR_MESSAGE = ( "Attempted to 'create' object store entity in configuration with no database session present." @@ -373,16 +376,6 @@ def shutdown(self): """Close any connections for this ObjectStore.""" self.running = False - def file_ready( - self, obj, base_dir=None, dir_only=False, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False - ): - """ - Check if a file corresponding to a dataset is ready to be used. - - Return True if so, False otherwise - """ - return True - @classmethod def parse_xml(clazz, config_xml): """Parse an XML description of a configuration for this object store. 
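For readers unfamiliar with boto3's transfer tuning, the transfer: options documented in the boto3 samples above correspond to boto3's TransferConfig, which is what the linked documentation describes. Below is a minimal, standalone sketch of that mapping using plain boto3. It is an illustration only, not Galaxy's s3_boto3 backend, and the credentials, bucket, and file names are placeholders.

# Sketch only: how the sample's transfer options relate to boto3's TransferConfig.
import boto3
from boto3.s3.transfer import TransferConfig

client = boto3.client(
    "s3",
    aws_access_key_id="...",               # placeholder credentials
    aws_secret_access_key="...",
    region_name="us-east-1",
    # For a non-AWS service, also pass endpoint_url=..., mirroring the
    # connection: block in the swift-style samples.
)

# Roughly the sample's multipart_threshold: 10000000 with upload_max_concurrency: 10
# and download_max_concurrency: 5. Uploads and downloads each take their own
# TransferConfig, which is why the sample allows upload_/download_ prefixes on
# any of the options.
upload_config = TransferConfig(multipart_threshold=10_000_000, max_concurrency=10)
download_config = TransferConfig(multipart_threshold=10_000_000, max_concurrency=5)

client.upload_file("dataset_1.dat", "unique_bucket_name_all_lowercase", "dataset_1.dat", Config=upload_config)
client.download_file("unique_bucket_name_all_lowercase", "dataset_1.dat", "dataset_1.dat", Config=download_config)

The num_download_attempts, max_io_queue, io_chunksize, use_threads, and max_bandwidth options named in the sample are further TransferConfig keyword arguments of the same kind.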
@@ -938,10 +931,6 @@ def _exists(self, obj, **kwargs): """Determine if the `obj` exists in any of the backends.""" return self._call_method("_exists", obj, False, False, **kwargs) - def file_ready(self, obj, **kwargs): - """Determine if the file for `obj` is ready to be used by any of the backends.""" - return self._call_method("file_ready", obj, False, False, **kwargs) - def _create(self, obj, **kwargs): """Create a backing file in a random backend.""" objectstore = random.choice(list(self.backends.values())) @@ -1400,6 +1389,10 @@ def type_to_object_store_class(store: str, fsmon: bool = False) -> Tuple[Type[Ba objectstore_constructor_kwds = {} if store == "disk": objectstore_class = DiskObjectStore + elif store == "boto3": + from .s3_boto3 import S3ObjectStore as Boto3ObjectStore + + objectstore_class = Boto3ObjectStore elif store in ["s3", "aws_s3"]: from .s3 import S3ObjectStore @@ -1672,18 +1665,27 @@ def persist_extra_files( if not extra_files_path_name: extra_files_path_name = primary_data.dataset.extra_files_path_name_from(object_store) assert extra_files_path_name - for root, _dirs, files in safe_walk(src_extra_files_path): - extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path)) - extra_dir = os.path.normpath(extra_dir) - for f in files: - if not in_directory(f, src_extra_files_path): - # Unclear if this can ever happen if we use safe_walk ... probably not ? - raise MalformedContents(f"Invalid dataset path: {f}") - object_store.update_from_file( - primary_data.dataset, - extra_dir=extra_dir, - alt_name=f, - file_name=os.path.join(root, f), - create=True, - preserve_symlinks=True, - ) + persist_extra_files_for_dataset(object_store, src_extra_files_path, primary_data.dataset, extra_files_path_name) + + +def persist_extra_files_for_dataset( + object_store: ObjectStore, + src_extra_files_path: str, + dataset: "Dataset", + extra_files_path_name: str, +): + for root, _dirs, files in safe_walk(src_extra_files_path): + extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path)) + extra_dir = os.path.normpath(extra_dir) + for f in files: + if not in_directory(f, src_extra_files_path): + # Unclear if this can ever happen if we use safe_walk ... probably not ? 
+ raise MalformedContents(f"Invalid dataset path: {f}") + object_store.update_from_file( + dataset, + extra_dir=extra_dir, + alt_name=f, + file_name=os.path.join(root, f), + create=True, + preserve_symlinks=True, + ) diff --git a/lib/galaxy/objectstore/_caching_base.py b/lib/galaxy/objectstore/_caching_base.py new file mode 100644 index 000000000000..b63593ec7c50 --- /dev/null +++ b/lib/galaxy/objectstore/_caching_base.py @@ -0,0 +1,406 @@ +import logging +import os +import shutil +from datetime import datetime +from typing import ( + Any, + Dict, + Optional, +) + +from galaxy.exceptions import ( + ObjectInvalid, + ObjectNotFound, +) +from galaxy.objectstore import ConcreteObjectStore +from galaxy.util import ( + directory_hash_id, + unlink, +) +from galaxy.util.path import safe_relpath +from ._util import fix_permissions +from .caching import ( + CacheTarget, + InProcessCacheMonitor, +) + +log = logging.getLogger(__name__) + + +class CachingConcreteObjectStore(ConcreteObjectStore): + staging_path: str + extra_dirs: Dict[str, str] + config: Any + cache_updated_data: bool + enable_cache_monitor: bool + cache_size: int + cache_monitor: Optional[InProcessCacheMonitor] = None + cache_monitor_interval: int + + def _ensure_staging_path_writable(self): + staging_path = self.staging_path + if not os.path.exists(staging_path): + os.makedirs(staging_path, exist_ok=True) + if not os.path.exists(staging_path): + raise Exception(f"Caching object store created with path '{staging_path}' that does not exist") + + if not os.access(staging_path, os.R_OK): + raise Exception(f"Caching object store created with path '{staging_path}' that does not readable") + if not os.access(staging_path, os.W_OK): + raise Exception(f"Caching object store created with path '{staging_path}' that does not writable") + + def _construct_path( + self, + obj, + base_dir=None, + dir_only=None, + extra_dir=None, + extra_dir_at_root=False, + alt_name=None, + obj_dir=False, + in_cache=False, + **kwargs, + ): + # extra_dir should never be constructed from provided data but just + # make sure there are no shenannigans afoot + if extra_dir and extra_dir != os.path.normpath(extra_dir): + log.warning("extra_dir is not normalized: %s", extra_dir) + raise ObjectInvalid("The requested object is invalid") + # ensure that any parent directory references in alt_name would not + # result in a path not contained in the directory path constructed here + if alt_name: + if not safe_relpath(alt_name): + log.warning("alt_name would locate path outside dir: %s", alt_name) + raise ObjectInvalid("The requested object is invalid") + # alt_name can contain parent directory references, but S3 will not + # follow them, so if they are valid we normalize them out + alt_name = os.path.normpath(alt_name) + + object_id = self._get_object_id(obj) + rel_path = os.path.join(*directory_hash_id(object_id)) + + if extra_dir is not None: + if extra_dir_at_root: + rel_path = os.path.join(extra_dir, rel_path) + else: + rel_path = os.path.join(rel_path, extra_dir) + + # for JOB_WORK directory + if obj_dir: + rel_path = os.path.join(rel_path, str(object_id)) + if base_dir: + base = self.extra_dirs.get(base_dir) + assert base + return os.path.join(base, rel_path) + + # This is how the remote file stores represent folders + rel_path = f"{rel_path}/" + + if not dir_only: + rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{object_id}.dat") + + if in_cache: + return self._get_cache_path(rel_path) + + return rel_path + + def _get_cache_path(self, rel_path: 
str) -> str: + return os.path.abspath(os.path.join(self.staging_path, rel_path)) + + def _in_cache(self, rel_path: str) -> bool: + """Check if the given dataset is in the local cache and return True if so.""" + cache_path = self._get_cache_path(rel_path) + return os.path.exists(cache_path) + + def _pull_into_cache(self, rel_path) -> bool: + # Ensure the cache directory structure exists (e.g., dataset_#_files/) + rel_path_dir = os.path.dirname(rel_path) + if not os.path.exists(self._get_cache_path(rel_path_dir)): + os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) + # Now pull in the file + file_ok = self._download(rel_path) + if file_ok: + fix_permissions(self.config, self._get_cache_path(rel_path_dir)) + else: + unlink(self._get_cache_path(rel_path), ignore_errors=True) + return file_ok + + def _get_data(self, obj, start=0, count=-1, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + # Check cache first and get file if not there + if not self._in_cache(rel_path): + self._pull_into_cache(rel_path) + # Read the file content from cache + data_file = open(self._get_cache_path(rel_path)) + data_file.seek(start) + content = data_file.read(count) + data_file.close() + return content + + def _exists(self, obj, **kwargs): + in_cache = exists_remotely = False + rel_path = self._construct_path(obj, **kwargs) + dir_only = kwargs.get("dir_only", False) + base_dir = kwargs.get("base_dir", None) + + # check job work directory stuff early to skip API hits. + if dir_only and base_dir: + if not os.path.exists(rel_path): + os.makedirs(rel_path, exist_ok=True) + return True + + in_cache = self._in_cache(rel_path) + exists_remotely = self._exists_remotely(rel_path) + dir_only = kwargs.get("dir_only", False) + base_dir = kwargs.get("base_dir", None) + if dir_only: + if in_cache or exists_remotely: + return True + else: + return False + + # TODO: Sync should probably not be done here. Add this to an async upload stack? + if in_cache and not exists_remotely: + self._push_to_storage(rel_path, source_file=self._get_cache_path(rel_path)) + return True + elif exists_remotely: + return True + else: + return False + + def _create(self, obj, **kwargs): + if not self._exists(obj, **kwargs): + # Pull out locally used fields + extra_dir = kwargs.get("extra_dir", None) + extra_dir_at_root = kwargs.get("extra_dir_at_root", False) + dir_only = kwargs.get("dir_only", False) + alt_name = kwargs.get("alt_name", None) + + # Construct hashed path + rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) + + # Optionally append extra_dir + if extra_dir is not None: + if extra_dir_at_root: + rel_path = os.path.join(extra_dir, rel_path) + else: + rel_path = os.path.join(rel_path, extra_dir) + + # Create given directory in cache + cache_dir = os.path.join(self.staging_path, rel_path) + if not os.path.exists(cache_dir): + os.makedirs(cache_dir, exist_ok=True) + + # If instructed, create the dataset in cache & in S3 + if not dir_only: + rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") + open(os.path.join(self.staging_path, rel_path), "w").close() + self._push_to_storage(rel_path, from_string="") + return self + + def _caching_allowed(self, rel_path: str, remote_size: Optional[int] = None) -> bool: + if remote_size is None: + remote_size = self._get_remote_size(rel_path) + if not self.cache_target.fits_in_cache(remote_size): + log.critical( + "File %s is larger (%s bytes) than the configured cache allows (%s). 
Cannot download.", + rel_path, + remote_size, + self.cache_target.log_description, + ) + return False + return True + + def _push_to_storage(self, rel_path, source_file=None, from_string=None): + source_file = source_file or self._get_cache_path(rel_path) + if from_string is None and not os.path.exists(source_file): + log.error( + "Tried updating remote path '%s' from source file '%s', but source file does not exist.", + rel_path, + source_file, + ) + return False + + if from_string is None and os.path.getsize(source_file) == 0: + log.debug( + "Wanted to push file '%s' to remote path '%s' but its size is 0; skipping.", source_file, rel_path + ) + return True + + if from_string is not None: + return self._push_string_to_path(rel_path, from_string) + else: + start_time = datetime.now() + log.debug( + "Pushing cache file '%s' of size %s bytes to '%s'", + source_file, + os.path.getsize(source_file), + rel_path, + ) + success = self._push_file_to_path(rel_path, source_file) + end_time = datetime.now() + log.debug( + "Pushed cache file '%s' to blob '%s' (%s bytes transferred in %s sec)", + source_file, + rel_path, + os.path.getsize(source_file), + end_time - start_time, + ) + return success + + def _empty(self, obj, **kwargs): + if self._exists(obj, **kwargs): + return self._size(obj, **kwargs) == 0 + else: + raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") + + def _size(self, obj, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + if self._in_cache(rel_path): + try: + return os.path.getsize(self._get_cache_path(rel_path)) + except OSError as ex: + log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s", rel_path, ex) + elif self._exists_remotely(rel_path): + return self._get_remote_size(rel_path) + log.warning("Did not find dataset '%s', returning 0 for size", rel_path) + return 0 + + def _get_filename(self, obj, **kwargs): + base_dir = kwargs.get("base_dir", None) + dir_only = kwargs.get("dir_only", False) + obj_dir = kwargs.get("obj_dir", False) + sync_cache = kwargs.get("sync_cache", True) + + rel_path = self._construct_path(obj, **kwargs) + + # for JOB_WORK directory + if base_dir and dir_only and obj_dir: + return os.path.abspath(rel_path) + + cache_path = self._get_cache_path(rel_path) + if not sync_cache: + return cache_path + + # Check if the file exists in the cache first, always pull if file size in cache is zero + # For dir_only - the cache cleaning may have left empty directories so I think we need to + # always resync the cache. Gotta make sure we're being judicious in out data.extra_files_path + # calls I think. + if not dir_only and self._in_cache(rel_path) and os.path.getsize(self._get_cache_path(rel_path)) > 0: + return cache_path + + # Check if the file exists in persistent storage and, if it does, pull it into cache + elif self._exists(obj, **kwargs): + if dir_only: + self._download_directory_into_cache(rel_path, cache_path) + return cache_path + else: + if self._pull_into_cache(rel_path): + return cache_path + raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") + + def _download_directory_into_cache(self, rel_path, cache_path): + # pithos & irods never did this prior to refactoring so I am assuming + # there is just operations that fail with these object stores. 
+ # As part of the refactoring that resulted in this method + # https://github.com/galaxyproject/galaxy/pull/18117 I wrote test + # cases and I verified the other object stores that didn't implement + # this had issues - I implemented this new functionality in the + # Azure and Cloud object stores to fix those object stores. New + # object stores should definitely override this. + pass + + def _delete(self, obj, entire_dir=False, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + extra_dir = kwargs.get("extra_dir", None) + base_dir = kwargs.get("base_dir", None) + dir_only = kwargs.get("dir_only", False) + obj_dir = kwargs.get("obj_dir", False) + try: + # Remove temporary data in JOB_WORK directory + if base_dir and dir_only and obj_dir: + shutil.rmtree(os.path.abspath(rel_path)) + return True + + # For the case of extra_files, because we don't have a reference to + # individual files/keys we need to remove the entire directory structure + # with all the files in it. This is easy for the local file system, + # but requires iterating through each individual key in S3 and deleing it. + if entire_dir and extra_dir: + shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) + return self._delete_remote_all(rel_path) + else: + # Delete from cache first + unlink(self._get_cache_path(rel_path), ignore_errors=True) + # Delete from S3 as well + if self._exists_remotely(rel_path): + return self._delete_existing_remote(rel_path) + except OSError: + log.exception("%s delete error", self._get_filename(obj, **kwargs)) + return False + + def _update_from_file(self, obj, file_name=None, create=False, **kwargs): + if create: + self._create(obj, **kwargs) + + if self._exists(obj, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + # Chose whether to use the dataset file itself or an alternate file + if file_name: + source_file = os.path.abspath(file_name) + # Copy into cache + cache_file = self._get_cache_path(rel_path) + try: + if source_file != cache_file and self.cache_updated_data: + # FIXME? Should this be a `move`? 
+ shutil.copy2(source_file, cache_file) + fix_permissions(self.config, cache_file) + except OSError: + log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) + else: + source_file = self._get_cache_path(rel_path) + + self._push_to_storage(rel_path, source_file) + + else: + raise ObjectNotFound( + f"objectstore.update_from_file, object does not exist: {str(obj)}, kwargs: {str(kwargs)}" + ) + + @property + def cache_target(self) -> CacheTarget: + return CacheTarget( + self.staging_path, + self.cache_size, + 0.9, + ) + + def _shutdown_cache_monitor(self) -> None: + self.cache_monitor and self.cache_monitor.shutdown() + + def _start_cache_monitor_if_needed(self): + if self.enable_cache_monitor: + self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) + + def _get_remote_size(self, rel_path: str) -> int: + raise NotImplementedError() + + def _exists_remotely(self, rel_path: str) -> bool: + raise NotImplementedError() + + def _download(self, rel_path: str) -> bool: + raise NotImplementedError() + + # Do not need to override these if instead replacing _delete + def _delete_existing_remote(self, rel_path) -> bool: + raise NotImplementedError() + + def _delete_remote_all(self, rel_path) -> bool: + raise NotImplementedError() + + # Do not need to override these if instead replacing _push_to_storage + def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: + raise NotImplementedError() + + def _push_file_to_path(self, rel_path: str, target_file: str) -> bool: + raise NotImplementedError() diff --git a/lib/galaxy/objectstore/_util.py b/lib/galaxy/objectstore/_util.py new file mode 100644 index 000000000000..fbdf9adde4f6 --- /dev/null +++ b/lib/galaxy/objectstore/_util.py @@ -0,0 +1,35 @@ +import multiprocessing +import os +import subprocess + +from galaxy.util import ( + umask_fix_perms, + which, +) + + +def fix_permissions(config, rel_path: str): + """Set permissions on rel_path""" + for basedir, _, files in os.walk(rel_path): + umask_fix_perms(basedir, config.umask, 0o777, config.gid) + for filename in files: + path = os.path.join(basedir, filename) + # Ignore symlinks + if os.path.islink(path): + continue + umask_fix_perms(path, config.umask, 0o666, config.gid) + + +class UsesAxel: + use_axel: bool + + def _init_axel(self) -> None: + if which("axel"): + self.use_axel = True + else: + self.use_axel = False + + def _axel_download(self, url: str, path: str): + ncores = multiprocessing.cpu_count() + ret_code = subprocess.call(["axel", "-a", "-o", path, "-n", str(ncores), url]) + return ret_code == 0 diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 39e3c7490eb1..d3008ac77ccc 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -4,12 +4,10 @@ import logging import os -import shutil from datetime import ( datetime, timedelta, ) -from typing import Optional try: from azure.common import AzureHttpError @@ -19,23 +17,11 @@ generate_blob_sas, ) except ImportError: - BlobServiceClient = None + BlobServiceClient = None # type: ignore[assignment,unused-ignore,misc] -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) -from galaxy.util import ( - directory_hash_id, - umask_fix_perms, - unlink, -) -from galaxy.util.path import safe_relpath -from . 
import ConcreteObjectStore +from ._caching_base import CachingConcreteObjectStore from .caching import ( - CacheTarget, enable_cache_monitor, - InProcessCacheMonitor, parse_caching_config_dict_from_xml, ) @@ -57,7 +43,24 @@ def parse_config_xml(config_xml): container_xml = config_xml.find("container") container_name = container_xml.get("name") - max_chunk_size = int(container_xml.get("max_chunk_size", 250)) # currently unused + + transfer_xml = config_xml.findall("transfer") + if not transfer_xml: + transfer_xml = {} + else: + transfer_xml = transfer_xml[0] + transfer_dict = {} + for key in [ + "max_concurrency", + "download_max_concurrency", + "upload_max_concurrency", + "max_single_put_size", + "max_single_get_size", + "max_block_size", + ]: + value = transfer_xml.get(key) + if transfer_xml.get(key) is not None: + transfer_dict[key] = value cache_dict = parse_caching_config_dict_from_xml(config_xml) @@ -79,11 +82,11 @@ def parse_config_xml(config_xml): "auth": auth, "container": { "name": container_name, - "max_chunk_size": max_chunk_size, }, "cache": cache_dict, + "transfer": transfer_dict, "extra_dirs": extra_dirs, - "private": ConcreteObjectStore.parse_private_from_config_xml(config_xml), + "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml), } except Exception: # Toss it back up after logging, we can't continue loading at this point. @@ -91,14 +94,13 @@ def parse_config_xml(config_xml): raise -class AzureBlobObjectStore(ConcreteObjectStore): +class AzureBlobObjectStore(CachingConcreteObjectStore): """ Object store that stores objects as blobs in an Azure Blob Container. A local cache exists that is used as an intermediate location for files between Galaxy and Azure. """ - cache_monitor: Optional[InProcessCacheMonitor] = None store_type = "azure_blob" def __init__(self, config, config_dict): @@ -114,7 +116,20 @@ def __init__(self, config, config_dict): self.account_key = auth_dict.get("account_key") self.container_name = container_dict.get("name") - self.max_chunk_size = container_dict.get("max_chunk_size", 250) # currently unused + raw_transfer_dict = config_dict.get("transfer", {}) + typed_transfer_dict = {} + for key in [ + "max_concurrency", + "download_max_concurrency", + "upload_max_concurrency", + "max_single_put_size", + "max_single_get_size", + "max_block_size", + ]: + value = raw_transfer_dict.get(key) + if value is not None: + typed_transfer_dict[key] = int(value) + self.transfer_dict = typed_transfer_dict self.cache_size = cache_dict.get("size") or self.config.object_store_cache_size self.staging_path = cache_dict.get("path") or self.config.object_store_cache_path @@ -127,9 +142,8 @@ def _initialize(self): raise Exception(NO_BLOBSERVICE_ERROR_MESSAGE) self._configure_connection() - - if self.enable_cache_monitor: - self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) + self._ensure_staging_path_writable() + self._start_cache_monitor_if_needed() def to_dict(self): as_dict = super().to_dict() @@ -144,8 +158,8 @@ def to_dict(self): "auth": auth, "container": { "name": self.container_name, - "max_chunk_size": self.max_chunk_size, }, + "transfer": self.transfer_dict, "cache": { "size": self.cache_size, "path": self.staging_path, @@ -155,10 +169,6 @@ def to_dict(self): ) return as_dict - ################### - # Private Methods # - ################### - # config_xml is an ElementTree object. 
@classmethod def parse_xml(clazz, config_xml): @@ -166,87 +176,31 @@ def parse_xml(clazz, config_xml): def _configure_connection(self): log.debug("Configuring Connection") + extra_kwds = {} + for key in [ + "max_single_put_size", + "max_single_get_size", + "max_block_size", + ]: + if key in self.transfer_dict: + extra_kwds[key] = self.transfer_dict[key] + if self.account_url: # https://pypi.org/project/azure-storage-blob/ service = BlobServiceClient( account_url=self.account_url, credential={"account_name": self.account_name, "account_key": self.account_key}, + **extra_kwds, ) else: service = BlobServiceClient( account_url=f"https://{self.account_name}.blob.core.windows.net", credential=self.account_key, + **extra_kwds, ) self.service = service - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - # extra_dir should never be constructed from provided data but just - # make sure there are no shenannigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - return os.path.join(base, rel_path) - - # S3 folders are marked by having trailing '/' so add it now - # rel_path = '%s/' % rel_path # assume for now we don't need this in Azure blob storage. 
- - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - - if in_cache: - return self._get_cache_path(rel_path) - - return rel_path - - def _fix_permissions(self, rel_path): - """Set permissions on rel_path""" - for basedir, _, files in os.walk(rel_path): - umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) - for filename in files: - path = os.path.join(basedir, filename) - # Ignore symlinks - if os.path.islink(path): - continue - umask_fix_perms(path, self.config.umask, 0o666, self.config.gid) - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - - def _get_size_in_azure(self, rel_path): + def _get_remote_size(self, rel_path): try: properties = self._blob_client(rel_path).get_blob_properties() size_in_bytes = properties.size @@ -255,9 +209,20 @@ def _get_size_in_azure(self, rel_path): log.exception("Could not get size of blob '%s' from Azure", rel_path) return -1 - def _in_azure(self, rel_path): + def _blobs_from(self, rel_path): + return self.service.get_container_client(self.container_name).list_blobs(name_starts_with=rel_path) + + def _exists_remotely(self, rel_path: str): try: - exists = self._blob_client(rel_path).exists() + is_dir = rel_path[-1] == "/" + if is_dir: + blobs = self._blobs_from(rel_path) + if blobs: + return True + else: + return False + else: + exists = self._blob_client(rel_path).exists() except AzureHttpError: log.exception("Trouble checking existence of Azure blob '%s'", rel_path) return False @@ -266,308 +231,82 @@ def _in_azure(self, rel_path): def _blob_client(self, rel_path: str): return self.service.get_blob_client(self.container_name, rel_path) - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache.""" - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - - def _pull_into_cache(self, rel_path): - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - if not os.path.exists(self._get_cache_path(rel_path_dir)): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - file_ok = self._download(rel_path) - self._fix_permissions(self._get_cache_path(rel_path_dir)) - return file_ok - def _download(self, rel_path): local_destination = self._get_cache_path(rel_path) try: log.debug("Pulling '%s' into cache to %s", rel_path, local_destination) - if not self.cache_target.fits_in_cache(self._get_size_in_azure(rel_path)): - log.critical( - "File %s is larger (%s bytes) than the configured cache allows (%s). Cannot download.", - rel_path, - self._get_size_in_azure(rel_path), - self.cache_target.log_description, - ) + if not self._caching_allowed(rel_path): return False else: - with open(local_destination, "wb") as f: - self._blob_client(rel_path).download_blob().download_to_stream(f) + self._download_to_file(rel_path, local_destination) return True except AzureHttpError: log.exception("Problem downloading '%s' from Azure", rel_path) return False - def _push_to_os(self, rel_path, source_file=None, from_string=None): - """ - Push the file pointed to by ``rel_path`` to the object store naming the blob - ``rel_path``. If ``source_file`` is provided, push that file instead while - still using ``rel_path`` as the blob name. - If ``from_string`` is provided, set contents of the file to the value of - the string. 
- """ - try: - source_file = source_file or self._get_cache_path(rel_path) + def _download_to_file(self, rel_path, local_destination): + kwd = {} + max_concurrency = self.transfer_dict.get("download_max_concurrency") or self.transfer_dict.get( + "max_concurrency" + ) + if max_concurrency is not None: + kwd["max_concurrency"] = max_concurrency + with open(local_destination, "wb") as f: + self._blob_client(rel_path).download_blob().download_to_stream(f, **kwd) - if from_string is None and not os.path.exists(source_file): - log.error( - "Tried updating blob '%s' from source file '%s', but source file does not exist.", - rel_path, - source_file, - ) - return False + def _download_directory_into_cache(self, rel_path, cache_path): + blobs = self._blobs_from(rel_path) + for blob in blobs: + key = blob.name + local_file_path = os.path.join(cache_path, os.path.relpath(key, rel_path)) - if from_string is None and os.path.getsize(source_file) == 0: - log.debug( - "Wanted to push file '%s' to azure blob '%s' but its size is 0; skipping.", source_file, rel_path - ) - return True + # Create directories if they don't exist + os.makedirs(os.path.dirname(local_file_path), exist_ok=True) - if from_string is not None: - self._blob_client(rel_path).upload_blob(from_string, overwrite=True) - log.debug("Pushed data from string '%s' to blob '%s'", from_string, rel_path) - else: - start_time = datetime.now() - log.debug( - "Pushing cache file '%s' of size %s bytes to '%s'", - source_file, - os.path.getsize(source_file), - rel_path, - ) - with open(source_file, "rb") as f: - self._blob_client(rel_path).upload_blob(f, overwrite=True) - end_time = datetime.now() - log.debug( - "Pushed cache file '%s' to blob '%s' (%s bytes transferred in %s sec)", - source_file, - rel_path, - os.path.getsize(source_file), - end_time - start_time, - ) - return True + # Download the file + self._download_to_file(key, local_file_path) + def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: + try: + self._blob_client(rel_path).upload_blob(from_string, overwrite=True) + return True except AzureHttpError: - log.exception("Trouble pushing to Azure Blob '%s' from file '%s'", rel_path, source_file) - return False - - ################## - # Public Methods # - ################## - - def _exists(self, obj, **kwargs): - in_cache = in_azure = False - rel_path = self._construct_path(obj, **kwargs) - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) + log.exception("Trouble pushing to Azure Blob '%s' from string", rel_path) + return False - # check job work directory stuff early to skip API hits. 
- if dir_only and base_dir: - if not os.path.exists(rel_path): - os.makedirs(rel_path, exist_ok=True) + def _push_file_to_path(self, rel_path: str, source_file: str) -> bool: + try: + with open(source_file, "rb") as f: + kwd = {} + max_concurrency = self.transfer_dict.get("upload_max_concurrency") or self.transfer_dict.get( + "max_concurrency" + ) + if max_concurrency is not None: + kwd["max_concurrency"] = max_concurrency + self._blob_client(rel_path).upload_blob(f, overwrite=True, **kwd) return True + except AzureHttpError: + log.exception("Trouble pushing to Azure Blob '%s' from file '%s'", rel_path, source_file) + return False - in_cache = self._in_cache(rel_path) - in_azure = self._in_azure(rel_path) - # log.debug("~~~~~~ File '%s' exists in cache: %s; in azure: %s" % (rel_path, in_cache, in_azure)) - # dir_only does not get synced so shortcut the decision - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - if dir_only: - if in_cache or in_azure: - return True - else: - return False - - # TODO: Sync should probably not be done here. Add this to an async upload stack? - if in_cache and not in_azure: - self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path)) - return True - elif in_azure: + def _delete_remote_all(self, rel_path: str) -> bool: + try: + blobs = self._blobs_from(rel_path) + for blob in blobs: + log.debug("Deleting from Azure: %s", blob) + self._blob_client(blob.name).delete_blob() return True - else: + except AzureHttpError: + log.exception("Could not delete blob '%s' from Azure", rel_path) return False - def file_ready(self, obj, **kwargs): - """ - A helper method that checks if a file corresponding to a dataset is - ready and available to be used. Return ``True`` if so, ``False`` otherwise. 
- """ - rel_path = self._construct_path(obj, **kwargs) - # Make sure the size in cache is available in its entirety - if self._in_cache(rel_path): - local_size = os.path.getsize(self._get_cache_path(rel_path)) - remote_size = self._get_size_in_azure(rel_path) - if local_size == remote_size: - return True - else: - log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path, local_size, remote_size) - - return False - - def _create(self, obj, **kwargs): - if not self._exists(obj, **kwargs): - # Pull out locally used fields - extra_dir = kwargs.get("extra_dir", None) - extra_dir_at_root = kwargs.get("extra_dir_at_root", False) - dir_only = kwargs.get("dir_only", False) - alt_name = kwargs.get("alt_name", None) - - # Construct hashed path - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - # Optionally append extra_dir - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # Create given directory in cache - cache_dir = os.path.join(self.staging_path, rel_path) - if not os.path.exists(cache_dir): - os.makedirs(cache_dir, exist_ok=True) - - # Although not really necessary to create S3 folders (because S3 has - # flat namespace), do so for consistency with the regular file system - # S3 folders are marked by having trailing '/' so add it now - # s3_dir = '%s/' % rel_path - # self._push_to_os(s3_dir, from_string='') - # If instructed, create the dataset in cache & in S3 - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - open(os.path.join(self.staging_path, rel_path), "w").close() - self._push_to_os(rel_path, from_string="") - return self - - def _empty(self, obj, **kwargs): - if self._exists(obj, **kwargs): - size = self._size(obj, **kwargs) - is_empty = bool(size == 0) - return is_empty - else: - raise ObjectNotFound(f"objectstore.empty, object does not exist: {str(obj)}, kwargs: {str(kwargs)}") - - def _size(self, obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - if self._in_cache(rel_path): - try: - return os.path.getsize(self._get_cache_path(rel_path)) - except OSError as ex: - log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s", rel_path, ex) - elif self._exists(obj, **kwargs): - return self._get_size_in_azure(rel_path) - log.warning("Did not find dataset '%s', returning 0 for size", rel_path) - return 0 - - def _delete(self, obj, entire_dir=False, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - extra_dir = kwargs.get("extra_dir", None) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) + def _delete_existing_remote(self, rel_path: str) -> bool: try: - if base_dir and dir_only and obj_dir: - # Remove temporary data in JOB_WORK directory - shutil.rmtree(os.path.abspath(rel_path)) - return True - - # For the case of extra_files, because we don't have a reference to - # individual files/blobs we need to remove the entire directory structure - # with all the files in it. This is easy for the local file system, - # but requires iterating through each individual blob in Azure and deleing it. 
- if entire_dir and extra_dir: - shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) - blobs = self.service.get_container_client(self.container_name).list_blobs(name_starts_with=rel_path) - for blob in blobs: - log.debug("Deleting from Azure: %s", blob) - self._blob_client(blob.name).delete_blob() - return True - else: - # Delete from cache first - unlink(self._get_cache_path(rel_path), ignore_errors=True) - # Delete from S3 as well - if self._in_azure(rel_path): - log.debug("Deleting from Azure: %s", rel_path) - self._blob_client(rel_path).delete_blob() - return True + self._blob_client(rel_path).delete_blob() + return True except AzureHttpError: log.exception("Could not delete blob '%s' from Azure", rel_path) - except OSError: - log.exception("%s delete error", self._get_filename(obj, **kwargs)) - return False - - def _get_data(self, obj, start=0, count=-1, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path): - self._pull_into_cache(rel_path) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - - def _get_filename(self, obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - sync_cache = kwargs.get("sync_cache", True) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - return os.path.abspath(rel_path) - - cache_path = self._get_cache_path(rel_path) - if not sync_cache: - return cache_path - # Check if the file exists in the cache first, always pull if file size in cache is zero - if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): - return cache_path - # Check if the file exists in persistent storage and, if it does, pull it into cache - elif self._exists(obj, **kwargs): - if dir_only: # Directories do not get pulled into cache - return cache_path - else: - if self._pull_into_cache(rel_path): - return cache_path - # For the case of retrieving a directory only, return the expected path - # even if it does not exist. - # if dir_only: - # return cache_path - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {str(obj)}, kwargs: {str(kwargs)}") - - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): - if create is True: - self._create(obj, **kwargs) - - if self._exists(obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Chose whether to use the dataset file itself or an alternate file - if file_name: - source_file = os.path.abspath(file_name) - # Copy into cache - cache_file = self._get_cache_path(rel_path) - try: - if source_file != cache_file and self.cache_updated_data: - # FIXME? Should this be a `move`? 
- shutil.copy2(source_file, cache_file) - self._fix_permissions(cache_file) - except OSError: - log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) - else: - source_file = self._get_cache_path(rel_path) - - self._push_to_os(rel_path, source_file) - - else: - raise ObjectNotFound( - f"objectstore.update_from_file, object does not exist: {str(obj)}, kwargs: {str(kwargs)}" - ) + return False def _get_object_url(self, obj, **kwargs): if self._exists(obj, **kwargs): @@ -593,13 +332,5 @@ def _get_store_usage_percent(self, obj): # https://learn.microsoft.com/en-us/azure/storage/blobs/scalability-targets return 0.0 - @property - def cache_target(self) -> CacheTarget: - return CacheTarget( - self.staging_path, - self.cache_size, - 0.9, - ) - def shutdown(self): - self.cache_monitor and self.cache_monitor.shutdown() + self._shutdown_cache_monitor() diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 0a7f80e37b31..79af7a6df8ad 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -3,30 +3,12 @@ """ import logging -import multiprocessing import os import os.path -import shutil -import subprocess -from datetime import datetime -from typing import Optional - -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) -from galaxy.util import ( - directory_hash_id, - safe_relpath, - umask_fix_perms, - unlink, -) -from . import ConcreteObjectStore -from .caching import ( - CacheTarget, - enable_cache_monitor, - InProcessCacheMonitor, -) + +from ._caching_base import CachingConcreteObjectStore +from ._util import UsesAxel +from .caching import enable_cache_monitor from .s3 import parse_config_xml try: @@ -47,36 +29,17 @@ ) -class CloudConfigMixin: - def _config_to_dict(self): - return { - "provider": self.provider, - "auth": self.credentials, - "bucket": { - "name": self.bucket_name, - "use_reduced_redundancy": self.use_rr, - }, - "cache": { - "size": self.cache_size, - "path": self.staging_path, - "cache_updated_data": self.cache_updated_data, - }, - } - - -class Cloud(ConcreteObjectStore, CloudConfigMixin): +class Cloud(CachingConcreteObjectStore, UsesAxel): """ Object store that stores objects as items in an cloud storage. A local cache exists that is used as an intermediate location for files between Galaxy and the cloud storage. 
""" - cache_monitor: Optional[InProcessCacheMonitor] = None store_type = "cloud" def __init__(self, config, config_dict): super().__init__(config, config_dict) - self.transfer_progress = 0 bucket_dict = config_dict["bucket"] cache_dict = config_dict.get("cache") or {} @@ -100,17 +63,9 @@ def _initialize(self): self.conn = self._get_connection(self.provider, self.credentials) self.bucket = self._get_bucket(self.bucket_name) - self.start_cache_monitor() - # Test if 'axel' is available for parallel download and pull the key into cache - try: - subprocess.call("axel") - self.use_axel = True - except OSError: - self.use_axel = False - - def start_cache_monitor(self): - if self.enable_cache_monitor: - self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) + self._ensure_staging_path_writable() + self._start_cache_monitor_if_needed() + self._init_axel() @staticmethod def _get_connection(provider, credentials): @@ -235,13 +190,20 @@ def to_dict(self): as_dict.update(self._config_to_dict()) return as_dict - @property - def cache_target(self) -> CacheTarget: - return CacheTarget( - self.staging_path, - self.cache_size, - 0.9, - ) + def _config_to_dict(self): + return { + "provider": self.provider, + "auth": self.credentials, + "bucket": { + "name": self.bucket_name, + "use_reduced_redundancy": self.use_rr, + }, + "cache": { + "size": self.cache_size, + "path": self.staging_path, + "cache_updated_data": self.cache_updated_data, + }, + } def _get_bucket(self, bucket_name): try: @@ -260,75 +222,7 @@ def _get_bucket(self, bucket_name): log.exception(f"Could not get bucket '{bucket_name}'") raise Exception - def _fix_permissions(self, rel_path): - """Set permissions on rel_path""" - for basedir, _, files in os.walk(rel_path): - umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) - for filename in files: - path = os.path.join(basedir, filename) - # Ignore symlinks - if os.path.islink(path): - continue - umask_fix_perms(path, self.config.umask, 0o666, self.config.gid) - - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - # extra_dir should never be constructed from provided data but just - # make sure there are no shenannigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - return os.path.join(base, rel_path) - - # S3 folders are marked by having trailing '/' so add it now - rel_path = f"{rel_path}/" - - if not dir_only: - 
rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - - if in_cache: - return self._get_cache_path(rel_path) - - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - - def _get_transfer_progress(self): - return self.transfer_progress - - def _get_size_in_cloud(self, rel_path): + def _get_remote_size(self, rel_path): try: obj = self.bucket.objects.get(rel_path) return obj.size @@ -336,7 +230,7 @@ def _get_size_in_cloud(self, rel_path): log.exception("Could not get size of key '%s' from S3", rel_path) return -1 - def _key_exists(self, rel_path): + def _exists_remotely(self, rel_path): exists = False try: # A hackish way of testing if the rel_path is a folder vs a file @@ -354,322 +248,87 @@ def _key_exists(self, rel_path): return False return exists - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache and return True if so.""" - # log.debug("------ Checking cache for rel_path %s" % rel_path) - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - - def _pull_into_cache(self, rel_path): - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - if not os.path.exists(self._get_cache_path(rel_path_dir)): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - file_ok = self._download(rel_path) - self._fix_permissions(self._get_cache_path(rel_path_dir)) - return file_ok - - def _transfer_cb(self, complete, total): - self.transfer_progress += 10 - def _download(self, rel_path): + local_destination = self._get_cache_path(rel_path) try: - log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) + log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination) key = self.bucket.objects.get(rel_path) - # Test if cache is large enough to hold the new file - if not self.cache_target.fits_in_cache(key.size): - log.critical( - "File %s is larger (%s) than the configured cache allows (%s). Cannot download.", - rel_path, - key.size, - self.cache_target.log_description, - ) + remote_size = key.size + if not self._caching_allowed(rel_path, remote_size): return False - if self.use_axel: - log.debug("Parallel pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) - ncores = multiprocessing.cpu_count() - url = key.generate_url(7200) - ret_code = subprocess.call(f"axel -a -n {ncores} '{url}'") - if ret_code == 0: - return True - else: - log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) - self.transfer_progress = 0 # Reset transfer progress counter - with open(self._get_cache_path(rel_path), "wb+") as downloaded_file_handle: - key.save_content(downloaded_file_handle) - return True + log.debug("Pulled key '%s' into cache to %s", rel_path, local_destination) + self._download_to(key, local_destination) + return True except Exception: log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self.bucket.name) return False - def _push_to_os(self, rel_path, source_file=None, from_string=None): - """ - Push the file pointed to by ``rel_path`` to the object store naming the key - ``rel_path``. If ``source_file`` is provided, push that file instead while - still using ``rel_path`` as the key name. - If ``from_string`` is provided, set contents of the file to the value of - the string. 
- """ + def _download_directory_into_cache(self, rel_path, cache_path): + # List objects in the specified cloud folder + objects = self.bucket.objects.list(prefix=rel_path) + + for obj in objects: + remote_file_path = obj.name + local_file_path = os.path.join(cache_path, os.path.relpath(remote_file_path, rel_path)) + + # Create directories if they don't exist + os.makedirs(os.path.dirname(local_file_path), exist_ok=True) + + # Download the file + self._download_to(obj, local_file_path) + + def _download_to(self, key, local_destination): + if self.use_axel: + url = key.generate_url(7200) + return self._axel_download(url, local_destination) + else: + with open(local_destination, "wb+") as downloaded_file_handle: + key.save_content(downloaded_file_handle) + + def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: try: - source_file = source_file if source_file else self._get_cache_path(rel_path) - if os.path.exists(source_file): - if os.path.getsize(source_file) == 0 and (self.bucket.objects.get(rel_path) is not None): - log.debug( - "Wanted to push file '%s' to S3 key '%s' but its size is 0; skipping.", source_file, rel_path - ) - return True - if from_string: - if not self.bucket.objects.get(rel_path): - created_obj = self.bucket.objects.create(rel_path) - created_obj.upload(source_file) - else: - self.bucket.objects.get(rel_path).upload(source_file) - log.debug("Pushed data from string '%s' to key '%s'", from_string, rel_path) - else: - start_time = datetime.now() - log.debug( - "Pushing cache file '%s' of size %s bytes to key '%s'", - source_file, - os.path.getsize(source_file), - rel_path, - ) - self.transfer_progress = 0 # Reset transfer progress counter - if not self.bucket.objects.get(rel_path): - created_obj = self.bucket.objects.create(rel_path) - created_obj.upload_from_file(source_file) - else: - self.bucket.objects.get(rel_path).upload_from_file(source_file) - - end_time = datetime.now() - log.debug( - "Pushed cache file '%s' to key '%s' (%s bytes transfered in %s sec)", - source_file, - rel_path, - os.path.getsize(source_file), - end_time - start_time, - ) - return True + if not self.bucket.objects.get(rel_path): + created_obj = self.bucket.objects.create(rel_path) + created_obj.upload(from_string) else: - log.error( - "Tried updating key '%s' from source file '%s', but source file does not exist.", - rel_path, - source_file, - ) + self.bucket.objects.get(rel_path).upload(from_string) + return True except Exception: - log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file) - return False - - def file_ready(self, obj, **kwargs): - """ - A helper method that checks if a file corresponding to a dataset is - ready and available to be used. Return ``True`` if so, ``False`` otherwise. 
- """ - rel_path = self._construct_path(obj, **kwargs) - # Make sure the size in cache is available in its entirety - if self._in_cache(rel_path): - if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_cloud(rel_path): - return True - log.debug( - "Waiting for dataset %s to transfer from OS: %s/%s", - rel_path, - os.path.getsize(self._get_cache_path(rel_path)), - self._get_size_in_cloud(rel_path), - ) - return False + log.exception("Trouble pushing to cloud '%s' from string", rel_path) + return False - def _exists(self, obj, **kwargs): - in_cache = False - rel_path = self._construct_path(obj, **kwargs) - - # Check cache - if self._in_cache(rel_path): - in_cache = True - # Check cloud - in_cloud = self._key_exists(rel_path) - # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3)) - # dir_only does not get synced so shortcut the decision - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - if dir_only: - if in_cache or in_cloud: - return True - # for JOB_WORK directory - elif base_dir: - if not os.path.exists(rel_path): - os.makedirs(rel_path, exist_ok=True) - return True + def _push_file_to_path(self, rel_path: str, source_file: str) -> bool: + try: + if not self.bucket.objects.get(rel_path): + created_obj = self.bucket.objects.create(rel_path) + created_obj.upload_from_file(source_file) else: - return False - - # TODO: Sync should probably not be done here. Add this to an async upload stack? - if in_cache and not in_cloud: - self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path)) - return True - elif in_cloud: + self.bucket.objects.get(rel_path).upload_from_file(source_file) return True - else: + except Exception: + log.exception("Trouble pushing to cloud '%s' from file '%s'", rel_path, source_file) return False - def _create(self, obj, **kwargs): - if not self._exists(obj, **kwargs): - # Pull out locally used fields - extra_dir = kwargs.get("extra_dir", None) - extra_dir_at_root = kwargs.get("extra_dir_at_root", False) - dir_only = kwargs.get("dir_only", False) - alt_name = kwargs.get("alt_name", None) - - # Construct hashed path - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - # Optionally append extra_dir - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # Create given directory in cache - cache_dir = os.path.join(self.staging_path, rel_path) - if not os.path.exists(cache_dir): - os.makedirs(cache_dir, exist_ok=True) - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - open(os.path.join(self.staging_path, rel_path), "w").close() - self._push_to_os(rel_path, from_string="") - return self - - def _empty(self, obj, **kwargs): - if self._exists(obj, **kwargs): - return bool(self._size(obj, **kwargs) == 0) - else: - raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - - def _size(self, obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - if self._in_cache(rel_path): - try: - return os.path.getsize(self._get_cache_path(rel_path)) - except OSError as ex: - log.info("Could not get size of file '%s' in local cache, will try cloud. 
Error: %s", rel_path, ex) - elif self._exists(obj, **kwargs): - return self._get_size_in_cloud(rel_path) - log.warning("Did not find dataset '%s', returning 0 for size", rel_path) - return 0 - - def _delete(self, obj, entire_dir=False, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - extra_dir = kwargs.get("extra_dir", None) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) + def _delete_remote_all(self, rel_path: str) -> bool: try: - # Remove temparory data in JOB_WORK directory - if base_dir and dir_only and obj_dir: - shutil.rmtree(os.path.abspath(rel_path)) - return True - - # For the case of extra_files, because we don't have a reference to - # individual files/keys we need to remove the entire directory structure - # with all the files in it. This is easy for the local file system, - # but requires iterating through each individual key in S3 and deleing it. - if entire_dir and extra_dir: - shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) - results = self.bucket.objects.list(prefix=rel_path) - for key in results: - log.debug("Deleting key %s", key.name) - key.delete() - return True - else: - # Delete from cache first - unlink(self._get_cache_path(rel_path), ignore_errors=True) - # Delete from S3 as well - if self._key_exists(rel_path): - key = self.bucket.objects.get(rel_path) - log.debug("Deleting key %s", key.name) - key.delete() - return True + results = self.bucket.objects.list(prefix=rel_path) + for key in results: + log.debug("Deleting key %s", key.name) + key.delete() + return True except Exception: log.exception("Could not delete key '%s' from cloud", rel_path) - except OSError: - log.exception("%s delete error", self._get_filename(obj, **kwargs)) - return False + return False - def _get_data(self, obj, start=0, count=-1, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path): - self._pull_into_cache(rel_path) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - - def _get_filename(self, obj, **kwargs): - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - rel_path = self._construct_path(obj, **kwargs) - sync_cache = kwargs.get("sync_cache", True) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - return os.path.abspath(rel_path) - - cache_path = self._get_cache_path(rel_path) - if not sync_cache: - return cache_path - # S3 does not recognize directories as files so cannot check if those exist. - # So, if checking dir only, ensure given dir exists in cache and return - # the expected cache path. 
- # dir_only = kwargs.get('dir_only', False) - # if dir_only: - # if not os.path.exists(cache_path): - # os.makedirs(cache_path) - # return cache_path - # Check if the file exists in the cache first, always pull if file size in cache is zero - if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): - return cache_path - # Check if the file exists in persistent storage and, if it does, pull it into cache - elif self._exists(obj, **kwargs): - if dir_only: # Directories do not get pulled into cache - return cache_path - else: - if self._pull_into_cache(rel_path): - return cache_path - # For the case of retrieving a directory only, return the expected path - # even if it does not exist. - # if dir_only: - # return cache_path - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") - # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path - - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): - if create: - self._create(obj, **kwargs) - if self._exists(obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Chose whether to use the dataset file itself or an alternate file - if file_name: - source_file = os.path.abspath(file_name) - # Copy into cache - cache_file = self._get_cache_path(rel_path) - try: - if source_file != cache_file and self.cache_updated_data: - # FIXME? Should this be a `move`? - shutil.copy2(source_file, cache_file) - self._fix_permissions(cache_file) - except OSError: - log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) - else: - source_file = self._get_cache_path(rel_path) - # Update the file on cloud - self._push_to_os(rel_path, source_file) - else: - raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}") + def _delete_existing_remote(self, rel_path: str) -> bool: + try: + key = self.bucket.objects.get(rel_path) + log.debug("Deleting key %s", key.name) + key.delete() + return True + except Exception: + log.exception("Could not delete key '%s' from cloud", rel_path) + return False def _get_object_url(self, obj, **kwargs): if self._exists(obj, **kwargs): @@ -685,4 +344,4 @@ def _get_store_usage_percent(self, obj): return 0.0 def shutdown(self): - self.cache_monitor and self.cache_monitor.shutdown() + self._shutdown_cache_monitor() diff --git a/lib/galaxy/objectstore/examples/__init__.py b/lib/galaxy/objectstore/examples/__init__.py new file mode 100644 index 000000000000..42b05b2d6eb2 --- /dev/null +++ b/lib/galaxy/objectstore/examples/__init__.py @@ -0,0 +1,5 @@ +from galaxy.util.resources import resource_string + + +def get_example(filename: str) -> str: + return resource_string("galaxy.objectstore.examples", filename) diff --git a/lib/galaxy/objectstore/examples/aws_s3_integration_test.yml b/lib/galaxy/objectstore/examples/aws_s3_integration_test.yml new file mode 100644 index 000000000000..da3bd8ae3e04 --- /dev/null +++ b/lib/galaxy/objectstore/examples/aws_s3_integration_test.yml @@ -0,0 +1,17 @@ +type: aws_s3 +store_by: uuid +auth: + access_key: ${GALAXY_TEST_AWS_ACCESS_KEY} + secret_key: ${GALAXY_TEST_AWS_SECRET_KEY} + +bucket: + name: ${GALAXY_TEST_AWS_BUCKET} + +connection: + region: ${GALAXY_TEST_AWS_REGION} + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/azure_default_cache.xml 
b/lib/galaxy/objectstore/examples/azure_default_cache.xml new file mode 100644 index 000000000000..c9b95b72f62b --- /dev/null +++ b/lib/galaxy/objectstore/examples/azure_default_cache.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/lib/galaxy/objectstore/examples/azure_default_cache.yml b/lib/galaxy/objectstore/examples/azure_default_cache.yml new file mode 100644 index 000000000000..8f1eb80e22af --- /dev/null +++ b/lib/galaxy/objectstore/examples/azure_default_cache.yml @@ -0,0 +1,14 @@ +type: azure_blob +auth: + account_name: azureact + account_key: password123 + +container: + name: unique_container_name + max_chunk_size: 250 + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/azure_integration_test.yml b/lib/galaxy/objectstore/examples/azure_integration_test.yml new file mode 100644 index 000000000000..d44544856a2c --- /dev/null +++ b/lib/galaxy/objectstore/examples/azure_integration_test.yml @@ -0,0 +1,14 @@ +type: azure_blob +store_by: uuid +auth: + account_name: ${GALAXY_TEST_AZURE_ACCOUNT_NAME} + account_key: ${GALAXY_TEST_AZURE_ACCOUNT_KEY} + +container: + name: ${GALAXY_TEST_AZURE_CONTAINER_NAME} + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/azure_integration_test_distributed.yml b/lib/galaxy/objectstore/examples/azure_integration_test_distributed.yml new file mode 100644 index 000000000000..d8de569819a6 --- /dev/null +++ b/lib/galaxy/objectstore/examples/azure_integration_test_distributed.yml @@ -0,0 +1,38 @@ +type: distributed +backends: +- type: azure_blob + id: azure1 + store_by: uuid + name: Azure Store 1 + allow_selection: true + weight: 1 + auth: + account_name: ${GALAXY_TEST_AZURE_ACCOUNT_NAME} + account_key: ${GALAXY_TEST_AZURE_ACCOUNT_KEY} + + container: + name: ${GALAXY_TEST_AZURE_CONTAINER_NAME} + + extra_dirs: + - type: job_work + path: database/job_working_directory_azure_1 + - type: temp + path: database/tmp_azure_1 +- type: azure_blob + id: azure2 + store_by: uuid + name: Azure Store 2 + allow_selection: true + weight: 1 + auth: + account_name: ${GALAXY_TEST_AZURE_ACCOUNT_NAME} + account_key: ${GALAXY_TEST_AZURE_ACCOUNT_KEY} + + container: + name: ${GALAXY_TEST_AZURE_CONTAINER_NAME} + + extra_dirs: + - type: job_work + path: database/job_working_directory_azure_2 + - type: temp + path: database/tmp_azure_2 diff --git a/lib/galaxy/objectstore/examples/azure_integration_test_with_account_url.yml b/lib/galaxy/objectstore/examples/azure_integration_test_with_account_url.yml new file mode 100644 index 000000000000..e8cfcee2ecf9 --- /dev/null +++ b/lib/galaxy/objectstore/examples/azure_integration_test_with_account_url.yml @@ -0,0 +1,15 @@ +type: azure_blob +store_by: uuid +auth: + account_name: ${GALAXY_TEST_AZURE_ACCOUNT_NAME} + account_key: ${GALAXY_TEST_AZURE_ACCOUNT_KEY} + account_url: ${GALAXY_TEST_AZURE_ACCOUNT_URL} + +container: + name: ${GALAXY_TEST_AZURE_CONTAINER_NAME} + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/azure_simple.xml b/lib/galaxy/objectstore/examples/azure_simple.xml new file mode 100644 index 000000000000..aae420d633a6 --- /dev/null +++ b/lib/galaxy/objectstore/examples/azure_simple.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/lib/galaxy/objectstore/examples/azure_simple.yml 
b/lib/galaxy/objectstore/examples/azure_simple.yml new file mode 100644 index 000000000000..b2a57ed099fe --- /dev/null +++ b/lib/galaxy/objectstore/examples/azure_simple.yml @@ -0,0 +1,17 @@ +type: azure_blob +auth: + account_name: azureact + account_key: password123 + +container: + name: unique_container_name + +cache: + path: database/object_store_cache + size: 100 + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/azure_transfer.xml b/lib/galaxy/objectstore/examples/azure_transfer.xml new file mode 100644 index 000000000000..bab8b2913c63 --- /dev/null +++ b/lib/galaxy/objectstore/examples/azure_transfer.xml @@ -0,0 +1,13 @@ + + + + + + + + diff --git a/lib/galaxy/objectstore/examples/azure_transfer.yml b/lib/galaxy/objectstore/examples/azure_transfer.yml new file mode 100644 index 000000000000..2e29f85081db --- /dev/null +++ b/lib/galaxy/objectstore/examples/azure_transfer.yml @@ -0,0 +1,24 @@ +type: azure_blob +auth: + account_name: azureact + account_key: password123 + +container: + name: unique_container_name + +cache: + path: database/object_store_cache + size: 100 + +transfer: + download_max_concurrency: 1 + upload_max_concurrency: 2 + max_single_put_size: 10 + max_single_get_size: 20 + max_block_size: 3 + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/boto3_custom_connection.xml b/lib/galaxy/objectstore/examples/boto3_custom_connection.xml new file mode 100644 index 000000000000..7256007d9ab1 --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_custom_connection.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/lib/galaxy/objectstore/examples/boto3_custom_connection.yml b/lib/galaxy/objectstore/examples/boto3_custom_connection.yml new file mode 100644 index 000000000000..a4485dcc09ef --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_custom_connection.yml @@ -0,0 +1,21 @@ +type: boto3 +auth: + access_key: access_moo + secret_key: secret_cow + +bucket: + name: unique_bucket_name_all_lowercase + +cache: + path: database/object_store_cache + size: 1000 + +connection: + endpoint_url: https://s3.example.org/ + region: the_example_region + +extra_dirs: +- type: job_work + path: database/job_working_directory_s3 +- type: temp + path: database/tmp_s3 diff --git a/lib/galaxy/objectstore/examples/boto3_integration_test_aws.yml b/lib/galaxy/objectstore/examples/boto3_integration_test_aws.yml new file mode 100644 index 000000000000..e3fe42ca027c --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_integration_test_aws.yml @@ -0,0 +1,14 @@ +type: boto3 +store_by: uuid +auth: + access_key: ${GALAXY_TEST_AWS_ACCESS_KEY} + secret_key: ${GALAXY_TEST_AWS_SECRET_KEY} + +bucket: + name: ${GALAXY_TEST_AWS_BUCKET} + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/boto3_integration_test_aws_new_bucket.yml b/lib/galaxy/objectstore/examples/boto3_integration_test_aws_new_bucket.yml new file mode 100644 index 000000000000..9a5ccded34aa --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_integration_test_aws_new_bucket.yml @@ -0,0 +1,14 @@ +type: boto3 +store_by: uuid +auth: + access_key: ${GALAXY_TEST_AWS_ACCESS_KEY} + secret_key: ${GALAXY_TEST_AWS_SECRET_KEY} + +bucket: + name: mycoolbucket${test_random_int} + +extra_dirs: +- type: job_work + path: 
database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/boto3_integration_test_multithreaded.yml b/lib/galaxy/objectstore/examples/boto3_integration_test_multithreaded.yml new file mode 100644 index 000000000000..fc4932e34700 --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_integration_test_multithreaded.yml @@ -0,0 +1,17 @@ +type: boto3 +store_by: uuid +auth: + access_key: ${GALAXY_TEST_AWS_ACCESS_KEY} + secret_key: ${GALAXY_TEST_AWS_SECRET_KEY} + +bucket: + name: ${GALAXY_TEST_AWS_BUCKET} + +transfer: + multipart_threshold: 10 + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.xml b/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.xml new file mode 100644 index 000000000000..791e9456607c --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.xml @@ -0,0 +1,16 @@ + + + + + + + + diff --git a/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.yml b/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.yml new file mode 100644 index 000000000000..3e84dd15c191 --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.yml @@ -0,0 +1,27 @@ +type: boto3 +auth: + access_key: access_moo + secret_key: secret_cow + +bucket: + name: unique_bucket_name_all_lowercase + +cache: + path: database/object_store_cache + size: 1000 + +transfer: + multipart_threshold: 13 + max_concurrency: 13 + multipart_chunksize: 13 + num_download_attempts: 13 + max_io_queue: 13 + io_chunksize: 13 + use_threads: false + max_bandwidth: 13 + +extra_dirs: +- type: job_work + path: database/job_working_directory_s3 +- type: temp + path: database/tmp_s3 diff --git a/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.xml b/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.xml new file mode 100644 index 000000000000..bec5184c3e00 --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.xml @@ -0,0 +1,24 @@ + + + + + + + + diff --git a/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.yml b/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.yml new file mode 100644 index 000000000000..044416e5f2bf --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.yml @@ -0,0 +1,35 @@ +type: boto3 +auth: + access_key: access_moo + secret_key: secret_cow + +bucket: + name: unique_bucket_name_all_lowercase + +cache: + path: database/object_store_cache + size: 1000 + +transfer: + upload_multipart_threshold: 13 + upload_max_concurrency: 13 + upload_multipart_chunksize: 13 + upload_num_download_attempts: 13 + upload_max_io_queue: 13 + upload_io_chunksize: 13 + upload_use_threads: false + upload_max_bandwidth: 13 + download_multipart_threshold: 14 + download_max_concurrency: 14 + download_multipart_chunksize: 14 + download_num_download_attempts: 14 + download_max_io_queue: 14 + download_io_chunksize: 14 + download_use_threads: true + download_max_bandwidth: 14 + +extra_dirs: +- type: job_work + path: database/job_working_directory_s3 +- type: temp + path: database/tmp_s3 diff --git a/lib/galaxy/objectstore/examples/boto3_simple.xml b/lib/galaxy/objectstore/examples/boto3_simple.xml new file mode 100644 index 000000000000..c145405d7689 --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_simple.xml @@ -0,0 +1,7 @@ + + + + + + + diff 
--git a/lib/galaxy/objectstore/examples/boto3_simple.yml b/lib/galaxy/objectstore/examples/boto3_simple.yml new file mode 100644 index 000000000000..8e74986694b5 --- /dev/null +++ b/lib/galaxy/objectstore/examples/boto3_simple.yml @@ -0,0 +1,17 @@ +type: boto3 +auth: + access_key: access_moo + secret_key: secret_cow + +bucket: + name: unique_bucket_name_all_lowercase + +cache: + path: database/object_store_cache + size: 1000 + +extra_dirs: +- type: job_work + path: database/job_working_directory_s3 +- type: temp + path: database/tmp_s3 diff --git a/lib/galaxy/objectstore/examples/cloud_aws_default_cache.xml b/lib/galaxy/objectstore/examples/cloud_aws_default_cache.xml new file mode 100644 index 000000000000..4479fe70f8c1 --- /dev/null +++ b/lib/galaxy/objectstore/examples/cloud_aws_default_cache.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/lib/galaxy/objectstore/examples/cloud_aws_no_auth.xml b/lib/galaxy/objectstore/examples/cloud_aws_no_auth.xml new file mode 100644 index 000000000000..9361987322ee --- /dev/null +++ b/lib/galaxy/objectstore/examples/cloud_aws_no_auth.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/lib/galaxy/objectstore/examples/cloud_aws_simple.xml b/lib/galaxy/objectstore/examples/cloud_aws_simple.xml new file mode 100644 index 000000000000..dc22faa2ea54 --- /dev/null +++ b/lib/galaxy/objectstore/examples/cloud_aws_simple.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/lib/galaxy/objectstore/examples/cloud_aws_simple.yml b/lib/galaxy/objectstore/examples/cloud_aws_simple.yml new file mode 100644 index 000000000000..1f12ee10402c --- /dev/null +++ b/lib/galaxy/objectstore/examples/cloud_aws_simple.yml @@ -0,0 +1,19 @@ +type: cloud +provider: aws +auth: + access_key: access_moo + secret_key: secret_cow + +bucket: + name: unique_bucket_name_all_lowercase + use_reduced_redundancy: false + +cache: + path: database/object_store_cache + size: 1000 + +extra_dirs: +- type: job_work + path: database/job_working_directory_cloud +- type: temp + path: database/tmp_cloud diff --git a/lib/galaxy/objectstore/examples/cloud_azure_simple.xml b/lib/galaxy/objectstore/examples/cloud_azure_simple.xml new file mode 100644 index 000000000000..4f69940bf371 --- /dev/null +++ b/lib/galaxy/objectstore/examples/cloud_azure_simple.xml @@ -0,0 +1,8 @@ + + + + + + + diff --git a/lib/galaxy/objectstore/examples/cloud_azure_simple.yml b/lib/galaxy/objectstore/examples/cloud_azure_simple.yml new file mode 100644 index 000000000000..abd0c87d9eab --- /dev/null +++ b/lib/galaxy/objectstore/examples/cloud_azure_simple.yml @@ -0,0 +1,21 @@ +type: cloud +provider: azure +auth: + subscription_id: a_sub_id + client_id: and_a_client_id + secret: and_a_secret_key + tenant: and_some_tenant_info + +bucket: + name: unique_bucket_name_all_lowercase + use_reduced_redundancy: false + +cache: + path: database/object_store_cache + size: 1000 + +extra_dirs: +- type: job_work + path: database/job_working_directory_cloud +- type: temp + path: database/tmp_cloud diff --git a/lib/galaxy/objectstore/examples/cloud_gcp_simple.xml b/lib/galaxy/objectstore/examples/cloud_gcp_simple.xml new file mode 100644 index 000000000000..9fcb683685cf --- /dev/null +++ b/lib/galaxy/objectstore/examples/cloud_gcp_simple.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/lib/galaxy/objectstore/examples/cloud_gcp_simple.yml b/lib/galaxy/objectstore/examples/cloud_gcp_simple.yml new file mode 100644 index 000000000000..c6cfa193291b --- /dev/null +++ b/lib/galaxy/objectstore/examples/cloud_gcp_simple.yml @@ -0,0 +1,18 @@ +type: cloud +provider: google 
+auth: + credentials_file: gcp.config + +bucket: + name: unique_bucket_name_all_lowercase + use_reduced_redundancy: false + +cache: + path: database/object_store_cache + size: 1000 + +extra_dirs: +- type: job_work + path: database/job_working_directory_cloud +- type: temp + path: database/tmp_cloud diff --git a/lib/galaxy/objectstore/examples/cloud_integration_test_aws.yml b/lib/galaxy/objectstore/examples/cloud_integration_test_aws.yml new file mode 100644 index 000000000000..f48047c190cd --- /dev/null +++ b/lib/galaxy/objectstore/examples/cloud_integration_test_aws.yml @@ -0,0 +1,15 @@ +type: cloud +store_by: uuid +provider: aws +auth: + access_key: ${GALAXY_TEST_AWS_ACCESS_KEY} + secret_key: ${GALAXY_TEST_AWS_SECRET_KEY} + +bucket: + name: ${GALAXY_TEST_AWS_BUCKET} + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/cloud_integration_test_aws_with_region.yml b/lib/galaxy/objectstore/examples/cloud_integration_test_aws_with_region.yml new file mode 100644 index 000000000000..135429e7029a --- /dev/null +++ b/lib/galaxy/objectstore/examples/cloud_integration_test_aws_with_region.yml @@ -0,0 +1,16 @@ +type: cloud +store_by: uuid +provider: aws +auth: + access_key: ${GALAXY_TEST_AWS_ACCESS_KEY} + secret_key: ${GALAXY_TEST_AWS_SECRET_KEY} + region: ${GALAXY_TEST_AWS_REGION} + +bucket: + name: ${GALAXY_TEST_AWS_BUCKET} + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/disk_badges.xml b/lib/galaxy/objectstore/examples/disk_badges.xml new file mode 100644 index 000000000000..6dc1f3a837ad --- /dev/null +++ b/lib/galaxy/objectstore/examples/disk_badges.xml @@ -0,0 +1,13 @@ + + + + + + + + Fast interconnects. + + + Storage is backed up to tape nightly. + + diff --git a/lib/galaxy/objectstore/examples/disk_badges.yml b/lib/galaxy/objectstore/examples/disk_badges.yml new file mode 100644 index 000000000000..5d0c23126dba --- /dev/null +++ b/lib/galaxy/objectstore/examples/disk_badges.yml @@ -0,0 +1,16 @@ +type: disk +files_dir: "${temp_directory}/files1" +store_by: uuid +extra_dirs: + - type: temp + path: "${temp_directory}/tmp1" + - type: job_work + path: "${temp_directory}/job_working_directory1" +badges: + - type: short_term + - type: faster + message: Fast interconnects. + - type: less_stable + - type: more_secure + - type: backed_up + message: Storage is backed up to tape nightly. 
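The boto3_merged_transfer_options.yml and boto3_separated_transfer_options.yml examples above exercise both forms of the transfer settings: unprefixed keys that apply to uploads and downloads alike, and upload_/download_-prefixed keys that apply to one direction only. A minimal sketch of how such a dict could be folded into boto3's TransferConfig follows; the helper name and fallback order are assumptions for illustration, only the option names and the TransferConfig target come from the sample configuration:

from boto3.s3.transfer import TransferConfig

# Option names accepted by boto3's TransferConfig, as listed in the sample config.
_TRANSFER_KEYS = (
    "multipart_threshold",
    "max_concurrency",
    "multipart_chunksize",
    "num_download_attempts",
    "max_io_queue",
    "io_chunksize",
    "use_threads",
    "max_bandwidth",
)


def transfer_config(options: dict, direction: str) -> TransferConfig:
    """Illustrative helper: build a TransferConfig for 'upload' or 'download'."""
    kwargs = {}
    for name in _TRANSFER_KEYS:
        # A direction-specific key (e.g. download_max_concurrency) wins over
        # the unprefixed key that applies to both directions.
        value = options.get(f"{direction}_{name}", options.get(name))
        if value is not None:
            kwargs[name] = value
    return TransferConfig(**kwargs)

Called as, say, transfer_config(transfer_options, "download"), a key such as download_max_concurrency: 14 would override an unprefixed max_concurrency for downloads only, matching the behaviour described in the sample object store configuration.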
diff --git a/lib/galaxy/objectstore/examples/distributed_disk.xml b/lib/galaxy/objectstore/examples/distributed_disk.xml new file mode 100644 index 000000000000..322e414e6041 --- /dev/null +++ b/lib/galaxy/objectstore/examples/distributed_disk.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/lib/galaxy/objectstore/examples/distributed_disk.yml b/lib/galaxy/objectstore/examples/distributed_disk.yml new file mode 100644 index 000000000000..080d01f2c104 --- /dev/null +++ b/lib/galaxy/objectstore/examples/distributed_disk.yml @@ -0,0 +1,26 @@ +type: distributed +backends: + - id: files1 + quota: + source: 1files + type: disk + weight: 2 + device: primary_disk + files_dir: "${temp_directory}/files1" + extra_dirs: + - type: temp + path: "${temp_directory}/tmp1" + - type: job_work + path: "${temp_directory}/job_working_directory1" + - id: files2 + quota: + source: 2files + type: disk + weight: 1 + device: primary_disk + files_dir: "${temp_directory}/files2" + extra_dirs: + - type: temp + path: "${temp_directory}/tmp2" + - type: job_work + path: "${temp_directory}/job_working_directory2" diff --git a/lib/galaxy/objectstore/examples/distributed_s3.yml b/lib/galaxy/objectstore/examples/distributed_s3.yml new file mode 100644 index 000000000000..ec73647b9454 --- /dev/null +++ b/lib/galaxy/objectstore/examples/distributed_s3.yml @@ -0,0 +1,34 @@ +type: distributed +backends: + - id: files1 + weight: 1 + type: s3 + auth: + access_key: access_moo + secret_key: secret_cow + + bucket: + name: unique_bucket_name_all_lowercase + use_reduced_redundancy: false + + extra_dirs: + - type: job_work + path: ${temp_directory}/job_working_directory_s3 + - type: temp + path: ${temp_directory}/tmp_s3 + - id: files2 + weight: 1 + type: s3 + auth: + access_key: access_moo + secret_key: secret_cow + + bucket: + name: unique_bucket_name_all_lowercase_2 + use_reduced_redundancy: false + + extra_dirs: + - type: job_work + path: ${temp_directory}/job_working_directory_s3_2 + - type: temp + path: ${temp_directory}/tmp_s3_2 diff --git a/lib/galaxy/objectstore/examples/gcp_boto3_integration_test.yml b/lib/galaxy/objectstore/examples/gcp_boto3_integration_test.yml new file mode 100644 index 000000000000..a8ce3e2ce3c1 --- /dev/null +++ b/lib/galaxy/objectstore/examples/gcp_boto3_integration_test.yml @@ -0,0 +1,17 @@ +type: boto3 +store_by: uuid +auth: + access_key: ${GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY} + secret_key: ${GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY} + +bucket: + name: ${GALAXY_TEST_GOOGLE_BUCKET} + +connection: + endpoint_url: https://storage.googleapis.com + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/gcp_boto3_integration_test_legacy_params.yml b/lib/galaxy/objectstore/examples/gcp_boto3_integration_test_legacy_params.yml new file mode 100644 index 000000000000..04c342cb83e8 --- /dev/null +++ b/lib/galaxy/objectstore/examples/gcp_boto3_integration_test_legacy_params.yml @@ -0,0 +1,20 @@ +type: boto3 +store_by: uuid +auth: + access_key: ${GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY} + secret_key: ${GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY} + +bucket: + name: ${GALAXY_TEST_GOOGLE_BUCKET} + +connection: + host: storage.googleapis.com + port: 443 + secure: true + conn_pat: '/' + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/gcp_s3_integration_test.yml 
b/lib/galaxy/objectstore/examples/gcp_s3_integration_test.yml new file mode 100644 index 000000000000..44d071778533 --- /dev/null +++ b/lib/galaxy/objectstore/examples/gcp_s3_integration_test.yml @@ -0,0 +1,18 @@ +type: generic_s3 +store_by: uuid +auth: + access_key: ${GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY} + secret_key: ${GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY} + +bucket: + name: ${GALAXY_TEST_GOOGLE_BUCKET} + +connection: + host: storage.googleapis.com + port: 443 + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure diff --git a/lib/galaxy/objectstore/examples/hierarchical_simple.xml b/lib/galaxy/objectstore/examples/hierarchical_simple.xml new file mode 100644 index 000000000000..2e8ec1051257 --- /dev/null +++ b/lib/galaxy/objectstore/examples/hierarchical_simple.xml @@ -0,0 +1,23 @@ + + + + + + This is our new storage cluster, check out the storage + on our institute's system page for [Fancy New Storage](http://computecenter.example.com/systems/fancystorage). + + + + + + + + This is our older legacy storage cluster, check out the storage + on our institute's system page for [Legacy Storage](http://computecenter.example.com/systems/legacystorage). + + + + + + + diff --git a/lib/galaxy/objectstore/examples/hierarchical_simple.yml b/lib/galaxy/objectstore/examples/hierarchical_simple.yml new file mode 100644 index 000000000000..1755b5c82099 --- /dev/null +++ b/lib/galaxy/objectstore/examples/hierarchical_simple.yml @@ -0,0 +1,28 @@ +type: hierarchical +backends: + - id: files1 + name: Newer Cool Storage + description: | + This is our new storage cluster, check out the storage + on our institute's system page for [Fancy New Storage](http://computecenter.example.com/systems/fancystorage). + type: disk + weight: 1 + files_dir: "${temp_directory}/files1" + extra_dirs: + - type: temp + path: "${temp_directory}/tmp1" + - type: job_work + path: "${temp_directory}/job_working_directory1" + - id: files2 + name: Older Legacy Storage + description: | + This is our older legacy storage cluster, check out the storage + on our institute's system page for [Legacy Storage](http://computecenter.example.com/systems/legacystorage). 
+ type: disk + weight: 1 + files_dir: "${temp_directory}/files2" + extra_dirs: + - type: temp + path: "${temp_directory}/tmp2" + - type: job_work + path: "${temp_directory}/job_working_directory2" diff --git a/lib/galaxy/objectstore/examples/pithos_simple.xml b/lib/galaxy/objectstore/examples/pithos_simple.xml new file mode 100644 index 000000000000..d7a5c30f11b1 --- /dev/null +++ b/lib/galaxy/objectstore/examples/pithos_simple.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/lib/galaxy/objectstore/examples/pithos_simple.yml b/lib/galaxy/objectstore/examples/pithos_simple.yml new file mode 100644 index 000000000000..86bd6c2cc965 --- /dev/null +++ b/lib/galaxy/objectstore/examples/pithos_simple.yml @@ -0,0 +1,14 @@ +type: pithos +auth: + url: http://example.org/ + token: extoken123 + +container: + name: foo + project: cow + +extra_dirs: + - type: temp + path: database/tmp_pithos + - type: job_work + path: database/working_pithos diff --git a/lib/galaxy/objectstore/examples/s3_global_cache.xml b/lib/galaxy/objectstore/examples/s3_global_cache.xml new file mode 100644 index 000000000000..ec1a4e40ea4c --- /dev/null +++ b/lib/galaxy/objectstore/examples/s3_global_cache.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/lib/galaxy/objectstore/examples/s3_global_cache.yml b/lib/galaxy/objectstore/examples/s3_global_cache.yml new file mode 100644 index 000000000000..de08a7142f43 --- /dev/null +++ b/lib/galaxy/objectstore/examples/s3_global_cache.yml @@ -0,0 +1,15 @@ +type: s3 +private: true +auth: + access_key: access_moo + secret_key: secret_cow + +bucket: + name: unique_bucket_name_all_lowercase + use_reduced_redundancy: false + +extra_dirs: +- type: job_work + path: database/job_working_directory_s3 +- type: temp + path: database/tmp_s3 diff --git a/lib/galaxy/objectstore/examples/s3_simple.xml b/lib/galaxy/objectstore/examples/s3_simple.xml new file mode 100644 index 000000000000..c64c618021f3 --- /dev/null +++ b/lib/galaxy/objectstore/examples/s3_simple.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/lib/galaxy/objectstore/examples/s3_simple.yml b/lib/galaxy/objectstore/examples/s3_simple.yml new file mode 100644 index 000000000000..b56a251b9c29 --- /dev/null +++ b/lib/galaxy/objectstore/examples/s3_simple.yml @@ -0,0 +1,19 @@ +type: s3 +private: true +auth: + access_key: access_moo + secret_key: secret_cow + +bucket: + name: unique_bucket_name_all_lowercase + use_reduced_redundancy: false + +cache: + path: database/object_store_cache + size: 1000 + +extra_dirs: +- type: job_work + path: database/job_working_directory_s3 +- type: temp + path: database/tmp_s3 diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index bb0bdead62b2..9241c1efe75c 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -20,19 +20,12 @@ except ImportError: irods = None -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) from galaxy.util import ( - directory_hash_id, ExecutionTimer, string_as_bool, - umask_fix_perms, unlink, ) -from galaxy.util.path import safe_relpath -from . 
import DiskObjectStore +from ._caching_base import CachingConcreteObjectStore IRODS_IMPORT_MESSAGE = "The Python irods package is required to use this feature, please install it" # 1 MB @@ -115,7 +108,7 @@ def parse_config_xml(config_xml): "cache_updated_data": cache_updated_data, }, "extra_dirs": extra_dirs, - "private": DiskObjectStore.parse_private_from_config_xml(config_xml), + "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml), } except Exception: # Toss it back up after logging, we can't continue loading at this point. @@ -123,35 +116,7 @@ def parse_config_xml(config_xml): raise -class CloudConfigMixin: - def _config_to_dict(self): - return { - "auth": { - "username": self.username, - "password": self.password, - }, - "resource": { - "name": self.resource, - }, - "zone": { - "name": self.zone, - }, - "connection": { - "host": self.host, - "port": self.port, - "timeout": self.timeout, - "refresh_time": self.refresh_time, - "connection_pool_monitor_interval": self.connection_pool_monitor_interval, - }, - "cache": { - "size": self.cache_size, - "path": self.staging_path, - "cache_updated_data": self.cache_updated_data, - }, - } - - -class IRODSObjectStore(DiskObjectStore, CloudConfigMixin): +class IRODSObjectStore(CachingConcreteObjectStore): """ Object store that stores files as data objects in an iRODS Zone. A local cache exists that is used as an intermediate location for files between Galaxy and iRODS. @@ -314,73 +279,34 @@ def to_dict(self): as_dict.update(self._config_to_dict()) return as_dict - def _fix_permissions(self, rel_path): - """Set permissions on rel_path""" - for basedir, _, files in os.walk(rel_path): - umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) - for filename in files: - path = os.path.join(basedir, filename) - # Ignore symlinks - if os.path.islink(path): - continue - umask_fix_perms(path, self.config.umask, 0o666, self.config.gid) - - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - ipt_timer = ExecutionTimer() - # extra_dir should never be constructed from provided data but just - # make sure there are no shenanigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - log.debug("irods_pt _construct_path: %s", ipt_timer) - return os.path.join(base, rel_path) - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - 
log.debug("irods_pt _construct_path: %s", ipt_timer) - - if in_cache: - return self._get_cache_path(rel_path) - - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) + def _config_to_dict(self): + return { + "auth": { + "username": self.username, + "password": self.password, + }, + "resource": { + "name": self.resource, + }, + "zone": { + "name": self.zone, + }, + "connection": { + "host": self.host, + "port": self.port, + "timeout": self.timeout, + "refresh_time": self.refresh_time, + "connection_pool_monitor_interval": self.connection_pool_monitor_interval, + }, + "cache": { + "size": self.cache_size, + "path": self.staging_path, + "cache_updated_data": self.cache_updated_data, + }, + } # rel_path is file or folder? - def _get_size_in_irods(self, rel_path): + def _get_remote_size(self, rel_path): ipt_timer = ExecutionTimer() p = Path(rel_path) data_object_name = p.stem + p.suffix @@ -397,10 +323,10 @@ def _get_size_in_irods(self, rel_path): log.warning("Collection or data object (%s) does not exist", data_object_path) return -1 finally: - log.debug("irods_pt _get_size_in_irods: %s", ipt_timer) + log.debug("irods_pt _get_remote_size: %s", ipt_timer) # rel_path is file or folder? - def _data_object_exists(self, rel_path): + def _exists_remotely(self, rel_path): ipt_timer = ExecutionTimer() p = Path(rel_path) data_object_name = p.stem + p.suffix @@ -417,28 +343,12 @@ def _data_object_exists(self, rel_path): log.debug("Collection or data object (%s) does not exist", data_object_path) return False finally: - log.debug("irods_pt _data_object_exists: %s", ipt_timer) - - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache and return True if so.""" - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - - def _pull_into_cache(self, rel_path): - ipt_timer = ExecutionTimer() - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - if not os.path.exists(self._get_cache_path(rel_path_dir)): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - file_ok = self._download(rel_path) - self._fix_permissions(self._get_cache_path(rel_path_dir)) - log.debug("irods_pt _pull_into_cache: %s", ipt_timer) - return file_ok + log.debug("irods_pt _exists_remotely: %s", ipt_timer) def _download(self, rel_path): ipt_timer = ExecutionTimer() - log.debug("Pulling data object '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) + cache_path = self._get_cache_path(rel_path) + log.debug("Pulling data object '%s' into cache to %s", rel_path, cache_path) p = Path(rel_path) data_object_name = p.stem + p.suffix @@ -452,7 +362,6 @@ def _download(self, rel_path): options = {kw.FORCE_FLAG_KW: "", kw.DEST_RESC_NAME_KW: self.resource} try: - cache_path = self._get_cache_path(rel_path) self.session.data_objects.get(data_object_path, cache_path, **options) log.debug("Pulled data object '%s' into cache to %s", rel_path, cache_path) return True @@ -462,7 +371,7 @@ def _download(self, rel_path): finally: log.debug("irods_pt _download: %s", ipt_timer) - def _push_to_irods(self, rel_path, source_file=None, from_string=None): + def _push_to_storage(self, rel_path, source_file=None, from_string=None): """ Push the file pointed to by ``rel_path`` to the iRODS. 
Extract folder name from rel_path as iRODS collection name, and extract file name from rel_path @@ -539,103 +448,7 @@ def _push_to_irods(self, rel_path, source_file=None, from_string=None): ) return True finally: - log.debug("irods_pt _push_to_irods: %s", ipt_timer) - - def file_ready(self, obj, **kwargs): - """ - A helper method that checks if a file corresponding to a dataset is - ready and available to be used. Return ``True`` if so, ``False`` otherwise. - """ - ipt_timer = ExecutionTimer() - rel_path = self._construct_path(obj, **kwargs) - # Make sure the size in cache is available in its entirety - if self._in_cache(rel_path): - if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_irods(rel_path): - log.debug("irods_pt _file_ready: %s", ipt_timer) - return True - log.debug( - "Waiting for dataset %s to transfer from OS: %s/%s", - rel_path, - os.path.getsize(self._get_cache_path(rel_path)), - self._get_size_in_irods(rel_path), - ) - log.debug("irods_pt _file_ready: %s", ipt_timer) - return False - - def _exists(self, obj, **kwargs): - ipt_timer = ExecutionTimer() - rel_path = self._construct_path(obj, **kwargs) - - # Check cache and irods - if self._in_cache(rel_path) or self._data_object_exists(rel_path): - log.debug("irods_pt _exists: %s", ipt_timer) - return True - - # dir_only does not get synced so shortcut the decision - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - if dir_only and base_dir: - # for JOB_WORK directory - if not os.path.exists(rel_path): - os.makedirs(rel_path, exist_ok=True) - log.debug("irods_pt _exists: %s", ipt_timer) - return True - log.debug("irods_pt _exists: %s", ipt_timer) - return False - - def _create(self, obj, **kwargs): - ipt_timer = ExecutionTimer() - if not self._exists(obj, **kwargs): - # Pull out locally used fields - extra_dir = kwargs.get("extra_dir", None) - extra_dir_at_root = kwargs.get("extra_dir_at_root", False) - dir_only = kwargs.get("dir_only", False) - alt_name = kwargs.get("alt_name", None) - - # Construct hashed path - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - # Optionally append extra_dir - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # Create given directory in cache - cache_dir = os.path.join(self.staging_path, rel_path) - if not os.path.exists(cache_dir): - os.makedirs(cache_dir, exist_ok=True) - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - open(os.path.join(self.staging_path, rel_path), "w").close() - self._push_to_irods(rel_path, from_string="") - log.debug("irods_pt _create: %s", ipt_timer) - return self - - def _empty(self, obj, **kwargs): - if self._exists(obj, **kwargs): - return bool(self._size(obj, **kwargs) > 0) - else: - raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - - def _size(self, obj, **kwargs) -> int: - ipt_timer = ExecutionTimer() - rel_path = self._construct_path(obj, **kwargs) - if self._in_cache(rel_path): - try: - return os.path.getsize(self._get_cache_path(rel_path)) - except OSError as ex: - log.info("Could not get size of file '%s' in local cache, will try iRODS. 
Error: %s", rel_path, ex) - finally: - log.debug("irods_pt _size: %s", ipt_timer) - elif self._exists(obj, **kwargs): - log.debug("irods_pt _size: %s", ipt_timer) - return self._get_size_in_irods(rel_path) - log.warning("Did not find dataset '%s', returning 0 for size", rel_path) - log.debug("irods_pt _size: %s", ipt_timer) - return 0 + log.debug("irods_pt _push_to_storage: %s", ipt_timer) def _delete(self, obj, entire_dir=False, **kwargs): ipt_timer = ExecutionTimer() @@ -705,92 +518,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.debug("irods_pt _delete: %s", ipt_timer) return False - def _get_data(self, obj, start=0, count=-1, **kwargs): - ipt_timer = ExecutionTimer() - rel_path = self._construct_path(obj, **kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path): - self._pull_into_cache(rel_path) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - log.debug("irods_pt _get_data: %s", ipt_timer) - return content - - def _get_filename(self, obj, **kwargs): - ipt_timer = ExecutionTimer() - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - rel_path = self._construct_path(obj, **kwargs) - sync_cache = kwargs.get("sync_cache", True) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - log.debug("irods_pt _get_filename: %s", ipt_timer) - return os.path.abspath(rel_path) - - cache_path = self._get_cache_path(rel_path) - if not sync_cache: - return cache_path - # iRODS does not recognize directories as files so cannot check if those exist. - # So, if checking dir only, ensure given dir exists in cache and return - # the expected cache path. - # dir_only = kwargs.get('dir_only', False) - # if dir_only: - # if not os.path.exists(cache_path): - # os.makedirs(cache_path) - # return cache_path - # Check if the file exists in the cache first, always pull if file size in cache is zero - if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): - log.debug("irods_pt _get_filename: %s", ipt_timer) - return cache_path - # Check if the file exists in persistent storage and, if it does, pull it into cache - elif self._exists(obj, **kwargs): - if dir_only: # Directories do not get pulled into cache - log.debug("irods_pt _get_filename: %s", ipt_timer) - return cache_path - else: - if self._pull_into_cache(rel_path): - log.debug("irods_pt _get_filename: %s", ipt_timer) - return cache_path - # For the case of retrieving a directory only, return the expected path - # even if it does not exist. - # if dir_only: - # return cache_path - log.debug("irods_pt _get_filename: %s", ipt_timer) - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") - # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path - - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): - ipt_timer = ExecutionTimer() - if create: - self._create(obj, **kwargs) - if self._exists(obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Choose whether to use the dataset file itself or an alternate file - if file_name: - source_file = os.path.abspath(file_name) - # Copy into cache - cache_file = self._get_cache_path(rel_path) - try: - if source_file != cache_file and self.cache_updated_data: - # FIXME? Should this be a `move`? 
- shutil.copy2(source_file, cache_file) - self._fix_permissions(cache_file) - except OSError: - log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) - else: - source_file = self._get_cache_path(rel_path) - # Update the file on iRODS - self._push_to_irods(rel_path, source_file) - else: - log.debug("irods_pt _update_from_file: %s", ipt_timer) - raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}") - log.debug("irods_pt _update_from_file: %s", ipt_timer) - # Unlike S3, url is not really applicable to iRODS def _get_object_url(self, obj, **kwargs): if self._exists(obj, **kwargs): diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 60a710f1542d..43697062d9d0 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -4,7 +4,6 @@ import logging import os -import shutil try: from kamaki.clients import ( @@ -17,16 +16,8 @@ except ImportError: KamakiClient = None -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) -from galaxy.util import ( - directory_hash_id, - umask_fix_perms, -) -from galaxy.util.path import safe_relpath -from . import ConcreteObjectStore +from galaxy.util import directory_hash_id +from ._caching_base import CachingConcreteObjectStore NO_KAMAKI_ERROR_MESSAGE = ( "ObjectStore configured, but no kamaki.clients dependency available." @@ -77,7 +68,7 @@ def parse_config_xml(config_xml): log.error(msg) raise Exception(msg) r["extra_dirs"] = [{k: e.get(k) for k in attrs} for e in extra_dirs] - r["private"] = ConcreteObjectStore.parse_private_from_config_xml(config_xml) + r["private"] = CachingConcreteObjectStore.parse_private_from_config_xml(config_xml) if "job_work" not in (d["type"] for d in r["extra_dirs"]): msg = f'No value for {tag}:type="job_work" in XML tree' log.error(msg) @@ -88,7 +79,7 @@ def parse_config_xml(config_xml): return r -class PithosObjectStore(ConcreteObjectStore): +class PithosObjectStore(CachingConcreteObjectStore): """ Object store that stores objects as items in a Pithos+ container. Cache is ignored for the time being. 
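The per-backend _size, _get_data, _get_filename, _exists and _update_from_file bodies removed from the cloud, iRODS and Pithos stores in the hunks above all follow the same cache-first pattern, and the imports now point at a shared galaxy.objectstore._caching_base.CachingConcreteObjectStore, with each backend keeping only small hooks such as _get_remote_size, _exists_remotely, _download and _push_to_storage. The base class itself is not part of this excerpt; the following is a sketch of what its shared _size template method plausibly looks like, reconstructed from the removed per-backend copies:

import logging
import os

log = logging.getLogger(__name__)


class CachingSizeSketch:
    """Hypothetical reconstruction of the shared _size template method.

    The hook names used here (_construct_path, _in_cache, _get_cache_path,
    _exists, _get_remote_size) are the ones this diff leaves in the
    backends; everything else is inferred from the removed copies.
    """

    def _size(self, obj, **kwargs) -> int:
        rel_path = self._construct_path(obj, **kwargs)
        if self._in_cache(rel_path):
            try:
                return os.path.getsize(self._get_cache_path(rel_path))
            except OSError as ex:
                log.info("Could not get size of '%s' in local cache, will try remote store. Error: %s", rel_path, ex)
        if self._exists(obj, **kwargs):
            # Each backend now only supplies _get_remote_size()
            # (iRODS, Pithos, Rucio and the cloud/S3 stores above).
            return self._get_remote_size(rel_path)
        log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
        return 0

The other removed methods would presumably be consolidated the same way, with the base class delegating to the backend hooks wherever the removed copies called bucket-, iRODS- or Pithos-specific APIs directly.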
@@ -101,7 +92,6 @@ def __init__(self, config, config_dict): self.staging_path = self.config.file_path log.info("Parse config_xml for pithos object store") self.config_dict = config_dict - log.debug(self.config_dict) self._initialize() @@ -109,6 +99,7 @@ def _initialize(self): if KamakiClient is None: raise Exception(NO_KAMAKI_ERROR_MESSAGE) + self._ensure_staging_path_writable() log.info("Authenticate Synnefo account") self._authenticate() log.info("Initialize Pithos+ client") @@ -152,91 +143,9 @@ def _init_pithos(self): if project and c.get("x-container-policy-project") != project: self.pithos.reassign_container(project) - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - """Construct path from object and parameters""" - # param extra_dir: should never be constructed from provided data but - # just make sure there are no shenannigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning(f"extra_dir is not normalized: {extra_dir}") - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning(f"alt_name would locate path outside dir: {alt_name}") - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - return os.path.join(base, rel_path) - - # Pithos+ folders are marked by having trailing '/' so add it now - rel_path = f"{rel_path}/" - - if not dir_only: - an = alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat" - rel_path = os.path.join(rel_path, an) - - if in_cache: - return self._get_cache_path(rel_path) - - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache and return True if - so. 
- """ - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - - def _fix_permissions(self, rel_path): - """Set permissions on rel_path""" - for basedir, _, files in os.walk(rel_path): - umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) - for filename in files: - path = os.path.join(basedir, filename) - # Ignore symlinks - if os.path.islink(path): - continue - umask_fix_perms(path, self.config.umask, 0o666, self.config.gid) - - def _pull_into_cache(self, rel_path): - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - rel_cache_path_dir = self._get_cache_path(rel_path_dir) - if not os.path.exists(rel_cache_path_dir): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - cache_path = self._get_cache_path(rel_path_dir) - self.pithos.download_object(rel_path, cache_path) - self._fix_permissions(cache_path) - return cache_path + def _download(self, rel_path): + local_destination = self._get_cache_path(rel_path) + self.pithos.download_object(rel_path, local_destination) # No need to overwrite "shutdown" @@ -305,27 +214,7 @@ def _create(self, obj, **kwargs): self.pithos.upload_from_string(rel_path, "") return self - def _empty(self, obj, **kwargs): - """ - :returns: weather the object has content - :raises ObjectNotFound: - """ - if not self._exists(obj, **kwargs): - raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - return bool(self._size(obj, **kwargs)) - - def _size(self, obj, **kwargs) -> int: - """ - :returns: The size of the object, or 0 if it doesn't exist (sorry for - that, not our fault, the ObjectStore interface is like that some - times) - """ - path = self._construct_path(obj, **kwargs) - if self._in_cache(path): - try: - return os.path.getsize(self._get_cache_path(path)) - except OSError as ex: - log.warning("Could not get size of file %s in local cache, will try Pithos. 
Error: %s", path, ex) + def _get_remote_size(self, path): try: file = self.pithos.get_object_info(path) except ClientError as ce: @@ -334,96 +223,22 @@ def _size(self, obj, **kwargs) -> int: return 0 return int(file["content-length"]) - def _delete(self, obj, **kwargs): - """Delete the object - :returns: weather the object was deleted - """ - path = self._construct_path(obj, **kwargs) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) + def _delete_remote_all(self, path: str) -> bool: try: - if all((base_dir, dir_only, obj_dir)): - shutil.rmtree(os.path.abspath(path)) - return True - cache_path = self._get_cache_path(path) - - entire_dir = kwargs.get("entire_dir", False) - extra_dir = kwargs.get("extra_dir", False) - if entire_dir and extra_dir: - shutil.rmtree(cache_path) - log.debug(f"On Pithos: delete -r {path}/") - self.pithos.del_object(path, delimiter="/") - return True - else: - os.unlink(cache_path) - self.pithos.del_object(path) - except OSError: - log.exception(f"{self._get_filename(obj, **kwargs)} delete error") - except ClientError as ce: - log.exception(f"Could not delete {path} from Pithos, {ce}") - return False - - def _get_data(self, obj, start=0, count=-1, **kwargs): - """Fetch (e.g., download) data - :param start: Chunk of data starts here - :param count: Fetch at most as many data, fetch all if negative - """ - path = self._construct_path(obj, **kwargs) - if self._in_cache(path): - cache_path = self._pull_into_cache(path) - else: - cache_path = self._get_cache_path(path) - data_file = open(cache_path) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - - def _get_filename(self, obj, **kwargs): - """Get the expected filename with absolute path""" - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - path = self._construct_path(obj, **kwargs) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - return os.path.abspath(path) - cache_path = self._get_cache_path(path) - if dir_only: - if not os.path.exists(cache_path): - os.makedirs(cache_path, exist_ok=True) - return cache_path - if self._in_cache(path): - return cache_path - elif self._exists(obj, **kwargs): - if not dir_only: - self._pull_into_cache(path) - return cache_path - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") - - def _update_from_file(self, obj, **kwargs): - """Update the store when a file is updated""" - if kwargs.get("create"): - self._create(obj, **kwargs) - if not self._exists(obj, **kwargs): - raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}") + log.debug(f"On Pithos: delete -r {path}/") + self.pithos.del_object(path, delimiter="/") + return True + except ClientError: + log.exception(f"Could not delete path '{path}' from Pithos") + return False - path = self._construct_path(obj, **kwargs) - cache_path = self._get_cache_path(path) - file_name = kwargs.get("file_name") - if file_name: - source_path = os.path.abspath(file_name) - try: - if source_path != cache_path: - shutil.copy2(source_path, cache_path) - self._fix_permissions(cache_path) - except OSError: - log.exception('Trouble copying source file "%s" to cache "%s"', source_path, cache_path) - else: - with open(cache_path) as f: - self.pithos.upload_object(obj, f) + def _delete_existing_remote(self, path: str) -> bool: + try: + 
self.pithos.del_object(path) + return True + except ClientError: + log.exception(f"Could not delete path '{path}' from Pithos") + return False def _get_object_url(self, obj, **kwargs): """ diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index 1d9c3d48b8d7..4bb6540a34de 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -2,7 +2,6 @@ import logging import os import shutil -from typing import Optional try: import rucio.common @@ -32,12 +31,9 @@ umask_fix_perms, unlink, ) -from galaxy.util.path import safe_relpath -from . import ConcreteObjectStore +from ._caching_base import CachingConcreteObjectStore from .caching import ( - CacheTarget, enable_cache_monitor, - InProcessCacheMonitor, parse_caching_config_dict_from_xml, ) @@ -273,7 +269,7 @@ def delete(self, key, auth_token): return True -class RucioObjectStore(ConcreteObjectStore): +class RucioObjectStore(CachingConcreteObjectStore): """ Object store implementation that uses ORNL remote data broker. @@ -281,8 +277,6 @@ class RucioObjectStore(ConcreteObjectStore): Galaxy at some future point or significantly modified. """ - cache_monitor: Optional[InProcessCacheMonitor] = None - store_type = "rucio" def to_dict(self): @@ -309,59 +303,8 @@ def __init__(self, config, config_dict): self._initialize() def _initialize(self): - if self.enable_cache_monitor: - self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) - - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache and return True if so.""" - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - **kwargs, - ): - # extra_dir should never be constructed from provided data but just - # make sure there are no shenanigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - return os.path.join(str(base), rel_path) - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) + self._ensure_staging_path_writable() + self._start_cache_monitor_if_needed() def _pull_into_cache(self, rel_path, auth_token): log.debug("rucio _pull_into_cache: %s", rel_path) @@ -414,25 +357,6 @@ def _exists(self, obj, **kwargs): def 
parse_xml(cls, config_xml): return parse_config_xml(config_xml) - def file_ready(self, obj, **kwargs): - log.debug("rucio file_ready") - """ - A helper method that checks if a file corresponding to a dataset is - ready and available to be used. Return ``True`` if so, ``False`` otherwise. - """ - rel_path = self._construct_path(obj, **kwargs) - # Make sure the size in cache is available in its entirety - if self._in_cache(rel_path): - if os.path.getsize(self._get_cache_path(rel_path)) == self.rucio_broker.get_size(rel_path): - return True - log.debug( - "Waiting for dataset %s to transfer from OS: %s/%s", - rel_path, - os.path.getsize(self._get_cache_path(rel_path)), - self.rucio_broker.get_size(rel_path), - ) - return False - def _create(self, obj, **kwargs): if not self._exists(obj, **kwargs): # Pull out locally used fields @@ -463,13 +387,6 @@ def _create(self, obj, **kwargs): log.debug("rucio _create: %s", rel_path) return self - def _empty(self, obj, **kwargs): - log.debug("rucio _empty") - if self._exists(obj, **kwargs): - return bool(self._size(obj, **kwargs) > 0) - else: - raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - def _size(self, obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) log.debug("rucio _size: %s", rel_path) @@ -482,10 +399,13 @@ def _size(self, obj, **kwargs): if size != 0: return size if self._exists(obj, **kwargs): - return self.rucio_broker.get_size(rel_path) + return self._get_remote_size(rel_path) log.warning("Did not find dataset '%s', returning 0 for size", rel_path) return 0 + def _get_remote_size(self, rel_path): + return self.rucio_broker.get_size(rel_path) + def _delete(self, obj, entire_dir=False, **kwargs): rel_path = self._construct_path(obj, **kwargs) extra_dir = kwargs.get("extra_dir", None) @@ -515,20 +435,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False - def _get_data(self, obj, start=0, count=-1, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - log.debug("rucio _get_data: %s", rel_path) - auth_token = self._get_token(**kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path) or os.path.getsize(self._get_cache_path(rel_path)) == 0: - self._pull_into_cache(rel_path, auth_token) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - def _get_token(self, **kwargs): auth_token = kwargs.get("auth_token", None) if auth_token: @@ -649,13 +555,5 @@ def __build_kwargs(self, obj, **kwargs): kwargs["object_id"] = obj.id return kwargs - @property - def cache_target(self) -> CacheTarget: - return CacheTarget( - self.staging_path, - self.cache_size, - 0.9, - ) - def shutdown(self): - self.cache_monitor and self.cache_monitor.shutdown() + self._shutdown_cache_monitor() diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 1caf355aec68..8c040e7523b8 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -3,13 +3,9 @@ """ import logging -import multiprocessing import os -import shutil -import subprocess import time from datetime import datetime -from typing import Optional try: # Imports are done this way to allow objectstore code to be used outside of Galaxy. 
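Note on the pattern: the pithos and rucio conversions above, like the s3 changes that continue below, move cache bookkeeping into the shared CachingConcreteObjectStore base from the new _caching_base module and leave each backend with a small set of remote primitives. The following is a rough, illustrative sketch of that implied contract, not code from this patch; the method names are taken from the hunks in this diff, and the exact set a backend supplies varies a little (for example the legacy boto store keeps a combined _push_to_storage, while the boto3 store below splits it into _push_string_to_path and _push_file_to_path).

from galaxy.objectstore._caching_base import CachingConcreteObjectStore


class MyRemoteObjectStore(CachingConcreteObjectStore):
    """Hypothetical backend showing the per-backend surface left after this refactor."""

    def _initialize(self):
        # lifecycle helpers provided by the caching base class
        self._ensure_staging_path_writable()
        self._start_cache_monitor_if_needed()

    def _exists_remotely(self, rel_path: str) -> bool:
        raise NotImplementedError  # head/list the key; a trailing "/" marks a "directory"

    def _get_remote_size(self, rel_path: str) -> int:
        raise NotImplementedError  # size of the remote object, compared against the cache copy

    def _download(self, rel_path: str) -> bool:
        raise NotImplementedError  # pull the key into self._get_cache_path(rel_path)

    def _delete_existing_remote(self, rel_path: str) -> bool:
        raise NotImplementedError  # delete a single key

    def _delete_remote_all(self, rel_path: str) -> bool:
        raise NotImplementedError  # delete every key under the prefix (the extra_files case)

    def shutdown(self):
        self._shutdown_cache_monitor()
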
@@ -20,23 +16,11 @@ except ImportError: boto = None # type: ignore[assignment] -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) -from galaxy.util import ( - directory_hash_id, - string_as_bool, - umask_fix_perms, - unlink, - which, -) -from galaxy.util.path import safe_relpath -from . import ConcreteObjectStore +from galaxy.util import string_as_bool +from ._caching_base import CachingConcreteObjectStore +from ._util import UsesAxel from .caching import ( - CacheTarget, enable_cache_monitor, - InProcessCacheMonitor, parse_caching_config_dict_from_xml, ) from .s3_multipart_upload import multipart_upload @@ -119,7 +103,7 @@ def parse_config_xml(config_xml): }, "cache": cache_dict, "extra_dirs": extra_dirs, - "private": ConcreteObjectStore.parse_private_from_config_xml(config_xml), + "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml), } except Exception: # Toss it back up after logging, we can't continue loading at this point. @@ -154,14 +138,13 @@ def _config_to_dict(self): } -class S3ObjectStore(ConcreteObjectStore, CloudConfigMixin): +class S3ObjectStore(CachingConcreteObjectStore, CloudConfigMixin, UsesAxel): """ Object store that stores objects as items in an AWS S3 bucket. A local cache exists that is used as an intermediate location for files between Galaxy and S3. """ - cache_monitor: Optional[InProcessCacheMonitor] = None store_type = "aws_s3" def __init__(self, config, config_dict): @@ -215,18 +198,11 @@ def _initialize(self): "conn_path": self.conn_path, } + self._ensure_staging_path_writable() self._configure_connection() self._bucket = self._get_bucket(self.bucket) - self.start_cache_monitor() - # Test if 'axel' is available for parallel download and pull the key into cache - if which("axel"): - self.use_axel = True - else: - self.use_axel = False - - def start_cache_monitor(self): - if self.enable_cache_monitor: - self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) + self._start_cache_monitor_if_needed() + self._init_axel() def _configure_connection(self): log.debug("Configuring S3 Connection") @@ -261,14 +237,6 @@ def to_dict(self): as_dict.update(self._config_to_dict()) return as_dict - @property - def cache_target(self) -> CacheTarget: - return CacheTarget( - self.staging_path, - self.cache_size, - 0.9, - ) - def _get_bucket(self, bucket_name): """Sometimes a handle to a bucket is not established right away so try it a few times. 
Raise error is connection is not established.""" @@ -288,73 +256,10 @@ def _get_bucket(self, bucket_name): # raise error raise S3ResponseError - def _fix_permissions(self, rel_path): - """Set permissions on rel_path""" - for basedir, _, files in os.walk(rel_path): - umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) - for filename in files: - path = os.path.join(basedir, filename) - # Ignore symlinks - if os.path.islink(path): - continue - umask_fix_perms(path, self.config.umask, 0o666, self.config.gid) - - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - # extra_dir should never be constructed from provided data but just - # make sure there are no shenannigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - return os.path.join(base, rel_path) - - # S3 folders are marked by having trailing '/' so add it now - rel_path = f"{rel_path}/" - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - if in_cache: - return self._get_cache_path(rel_path) - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - def _get_transfer_progress(self): return self.transfer_progress - def _get_size_in_s3(self, rel_path): + def _get_remote_size(self, rel_path): try: key = self._bucket.get_key(rel_path) return key.size @@ -362,7 +267,7 @@ def _get_size_in_s3(self, rel_path): log.exception("Could not get size of key '%s' from S3", rel_path) return -1 - def _key_exists(self, rel_path): + def _exists_remotely(self, rel_path): exists = False try: # A hackish way of testing if the rel_path is a folder vs a file @@ -381,82 +286,35 @@ def _key_exists(self, rel_path): return False return exists - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache and return True if so.""" - # log.debug("------ Checking cache for rel_path %s" % rel_path) - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - # TODO: Part of checking if a file is in cache should be to ensure the - # size of the cached file matches that on S3. Once the upload tool explicitly - # creates, this check sould be implemented- in the mean time, it's not - # looking likely to be implementable reliably. 
- # if os.path.exists(cache_path): - # # print("***1 %s exists" % cache_path) - # if self._key_exists(rel_path): - # # print("***2 %s exists in S3" % rel_path) - # # Make sure the size in cache is available in its entirety - # # print("File '%s' cache size: %s, S3 size: %s" % (cache_path, os.path.getsize(cache_path), self._get_size_in_s3(rel_path))) - # if os.path.getsize(cache_path) == self._get_size_in_s3(rel_path): - # # print("***2.1 %s exists in S3 and the size is the same as in cache (in_cache=True)" % rel_path) - # exists = True - # else: - # # print("***2.2 %s exists but differs in size from cache (in_cache=False)" % cache_path) - # exists = False - # else: - # # Although not perfect decision making, this most likely means - # # that the file is currently being uploaded - # # print("***3 %s found in cache but not in S3 (in_cache=True)" % cache_path) - # exists = True - # else: - # return False - - def _pull_into_cache(self, rel_path): - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - if not os.path.exists(self._get_cache_path(rel_path_dir)): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - file_ok = self._download(rel_path) - self._fix_permissions(self._get_cache_path(rel_path_dir)) - return file_ok - def _transfer_cb(self, complete, total): self.transfer_progress += 10 def _download(self, rel_path): + local_destination = self._get_cache_path(rel_path) try: - log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) + log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination) key = self._bucket.get_key(rel_path) if key is None: message = f"Attempting to download an invalid key for path {rel_path}." log.critical(message) raise Exception(message) - # Test if cache is large enough to hold the new file - if not self.cache_target.fits_in_cache(key.size): - log.critical( - "File %s is larger (%s) than the configured cache allows (%s). Cannot download.", - rel_path, - key.size, - self.cache_target.log_description, - ) + remote_size = key.size + if not self._caching_allowed(rel_path, remote_size): return False if self.use_axel: - log.debug("Parallel pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) - ncores = multiprocessing.cpu_count() + log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination) url = key.generate_url(7200) - ret_code = subprocess.call(["axel", "-a", "-n", str(ncores), url]) - if ret_code == 0: - return True + return self._axel_download(url, local_destination) else: - log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) + log.debug("Pulled key '%s' into cache to %s", rel_path, local_destination) self.transfer_progress = 0 # Reset transfer progress counter - key.get_contents_to_filename(self._get_cache_path(rel_path), cb=self._transfer_cb, num_cb=10) + key.get_contents_to_filename(local_destination, cb=self._transfer_cb, num_cb=10) return True except S3ResponseError: log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self._bucket.name) return False - def _push_to_os(self, rel_path, source_file=None, from_string=None): + def _push_to_storage(self, rel_path, source_file=None, from_string=None): """ Push the file pointed to by ``rel_path`` to the object store naming the key ``rel_path``. 
If ``source_file`` is provided, push that file instead while @@ -512,225 +370,29 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): raise return False - def file_ready(self, obj, **kwargs): - """ - A helper method that checks if a file corresponding to a dataset is - ready and available to be used. Return ``True`` if so, ``False`` otherwise. - """ - rel_path = self._construct_path(obj, **kwargs) - # Make sure the size in cache is available in its entirety - if self._in_cache(rel_path): - if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_s3(rel_path): - return True - log.debug( - "Waiting for dataset %s to transfer from OS: %s/%s", - rel_path, - os.path.getsize(self._get_cache_path(rel_path)), - self._get_size_in_s3(rel_path), - ) - return False - - def _exists(self, obj, **kwargs): - in_cache = in_s3 = False - rel_path = self._construct_path(obj, **kwargs) - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - - # check job work directory stuff early to skip API hits. - if dir_only and base_dir: - if not os.path.exists(rel_path): - os.makedirs(rel_path, exist_ok=True) - return True - - # Check cache - if self._in_cache(rel_path): - in_cache = True - # Check S3 - in_s3 = self._key_exists(rel_path) - # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3)) - # dir_only does not get synced so shortcut the decision - if dir_only: - if in_cache or in_s3: - return True - else: - return False - - # TODO: Sync should probably not be done here. Add this to an async upload stack? - if in_cache and not in_s3: - self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path)) - return True - elif in_s3: + def _delete_remote_all(self, rel_path: str) -> bool: + try: + results = self._bucket.get_all_keys(prefix=rel_path) + for key in results: + log.debug("Deleting key %s", key.name) + key.delete() return True - else: + except S3ResponseError: + log.exception("Could not delete blob '%s' from S3", rel_path) return False - def _create(self, obj, **kwargs): - if not self._exists(obj, **kwargs): - # Pull out locally used fields - extra_dir = kwargs.get("extra_dir", None) - extra_dir_at_root = kwargs.get("extra_dir_at_root", False) - dir_only = kwargs.get("dir_only", False) - alt_name = kwargs.get("alt_name", None) - - # Construct hashed path - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - # Optionally append extra_dir - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # Create given directory in cache - cache_dir = os.path.join(self.staging_path, rel_path) - if not os.path.exists(cache_dir): - os.makedirs(cache_dir, exist_ok=True) - - # Although not really necessary to create S3 folders (because S3 has - # flat namespace), do so for consistency with the regular file system - # S3 folders are marked by having trailing '/' so add it now - # s3_dir = '%s/' % rel_path - # self._push_to_os(s3_dir, from_string='') - # If instructed, create the dataset in cache & in S3 - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - open(os.path.join(self.staging_path, rel_path), "w").close() - self._push_to_os(rel_path, from_string="") - return self - - def _empty(self, obj, **kwargs): - if self._exists(obj, **kwargs): - return bool(self._size(obj, **kwargs) == 0) - else: - raise 
ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - - def _size(self, obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - if self._in_cache(rel_path): - try: - return os.path.getsize(self._get_cache_path(rel_path)) - except OSError as ex: - log.info("Could not get size of file '%s' in local cache, will try S3. Error: %s", rel_path, ex) - elif self._exists(obj, **kwargs): - return self._get_size_in_s3(rel_path) - log.warning("Did not find dataset '%s', returning 0 for size", rel_path) - return 0 - - def _delete(self, obj, entire_dir=False, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - extra_dir = kwargs.get("extra_dir", None) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) + def _delete_existing_remote(self, rel_path: str) -> bool: try: - # Remove temparory data in JOB_WORK directory - if base_dir and dir_only and obj_dir: - shutil.rmtree(os.path.abspath(rel_path)) - return True - - # For the case of extra_files, because we don't have a reference to - # individual files/keys we need to remove the entire directory structure - # with all the files in it. This is easy for the local file system, - # but requires iterating through each individual key in S3 and deleing it. - if entire_dir and extra_dir: - shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) - results = self._bucket.get_all_keys(prefix=rel_path) - for key in results: - log.debug("Deleting key %s", key.name) - key.delete() - return True - else: - # Delete from cache first - unlink(self._get_cache_path(rel_path), ignore_errors=True) - # Delete from S3 as well - if self._key_exists(rel_path): - key = Key(self._bucket, rel_path) - log.debug("Deleting key %s", key.name) - key.delete() - return True + key = Key(self._bucket, rel_path) + log.debug("Deleting key %s", key.name) + key.delete() + return True except S3ResponseError: - log.exception("Could not delete key '%s' from S3", rel_path) - except OSError: - log.exception("%s delete error", self._get_filename(obj, **kwargs)) - return False + log.exception("Could not delete blob '%s' from S3", rel_path) + return False - def _get_data(self, obj, start=0, count=-1, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path) or os.path.getsize(self._get_cache_path(rel_path)) == 0: - self._pull_into_cache(rel_path) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - - def _get_filename(self, obj, **kwargs): - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - sync_cache = kwargs.get("sync_cache", True) - - rel_path = self._construct_path(obj, **kwargs) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - return os.path.abspath(rel_path) - - cache_path = self._get_cache_path(rel_path) - if not sync_cache: - return cache_path - # S3 does not recognize directories as files so cannot check if those exist. - # So, if checking dir only, ensure given dir exists in cache and return - # the expected cache path. 
- # dir_only = kwargs.get('dir_only', False) - # if dir_only: - # if not os.path.exists(cache_path): - # os.makedirs(cache_path) - # return cache_path - # Check if the file exists in the cache first, always pull if file size in cache is zero - if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): - return cache_path - # Check if the file exists in persistent storage and, if it does, pull it into cache - elif self._exists(obj, **kwargs): - if dir_only: - download_directory(self._bucket, rel_path, cache_path) - return cache_path - else: - if self._pull_into_cache(rel_path): - return cache_path - # For the case of retrieving a directory only, return the expected path - # even if it does not exist. - # if dir_only: - # return cache_path - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") - # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path - - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): - if create: - self._create(obj, **kwargs) - if self._exists(obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Chose whether to use the dataset file itself or an alternate file - if file_name: - source_file = os.path.abspath(file_name) - # Copy into cache - cache_file = self._get_cache_path(rel_path) - try: - if source_file != cache_file and self.cache_updated_data: - # FIXME? Should this be a `move`? - shutil.copy2(source_file, cache_file) - self._fix_permissions(cache_file) - except OSError: - log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) - else: - source_file = self._get_cache_path(rel_path) - # Update the file on S3 - self._push_to_os(rel_path, source_file) - else: - raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}") + def _download_directory_into_cache(self, rel_path, cache_path): + download_directory(self._bucket, rel_path, cache_path) def _get_object_url(self, obj, **kwargs): if self._exists(obj, **kwargs): @@ -746,7 +408,7 @@ def _get_store_usage_percent(self, obj): return 0.0 def shutdown(self): - self.cache_monitor and self.cache_monitor.shutdown() + self._shutdown_cache_monitor() class GenericS3ObjectStore(S3ObjectStore): diff --git a/lib/galaxy/objectstore/s3_boto3.py b/lib/galaxy/objectstore/s3_boto3.py new file mode 100644 index 000000000000..81dd41ac97a5 --- /dev/null +++ b/lib/galaxy/objectstore/s3_boto3.py @@ -0,0 +1,424 @@ +"""A more modern version of the S3 object store based on boto3 instead of boto. +""" + +import logging +import os +from typing import ( + Any, + Callable, + Dict, + TYPE_CHECKING, +) + +from typing_extensions import ( + Literal, + NotRequired, + TypedDict, +) + +if TYPE_CHECKING: + from mypy_boto3_s3.client import S3Client + +try: + # Imports are done this way to allow objectstore code to be used outside of Galaxy. + import boto3 + from boto3.s3.transfer import TransferConfig + from botocore.client import ClientError +except ImportError: + boto3 = None # type: ignore[assignment,unused-ignore] + TransferConfig = None # type: ignore[assignment,unused-ignore,misc] + +from galaxy.util import asbool +from ._caching_base import CachingConcreteObjectStore +from .caching import ( + enable_cache_monitor, + parse_caching_config_dict_from_xml, +) + +NO_BOTO_ERROR_MESSAGE = ( + "S3/Swift object store configured, but no boto3 dependency available. "
+ "Please install and properly configure boto or modify object store configuration." +) + +log = logging.getLogger(__name__) +# This object store generates a lot of logging by default, fairly sure it is an anti-pattern +# to just disable library logging. +# logging.getLogger("botocore").setLevel(logging.INFO) +# logging.getLogger("s3transfer").setLevel(logging.INFO) + + +def host_to_endpoint(mapping): + # convert older-style boto parameters to boto3 endpoint_url. + host = mapping["host"] + port = mapping.get("port", 6000) + is_secure = asbool(mapping.get("is_secure", "True")) + conn_path = mapping.get("conn_path", "/") + scheme = "https" if is_secure else "http" + return f"{scheme}://{host}:{port}{conn_path}" + + +def parse_config_xml(config_xml): + try: + a_xml = config_xml.findall("auth")[0] + access_key = a_xml.get("access_key") + secret_key = a_xml.get("secret_key") + + b_xml = config_xml.findall("bucket")[0] + bucket_name = b_xml.get("name") + + cn_xml = config_xml.findall("connection") + if not cn_xml: + cn_xml = {} + else: + cn_xml = cn_xml[0] + endpoint_url = cn_xml.get("endpoint_url") + + # for admin ease - allow older style host, port, is_secure, conn_path to be used. + if endpoint_url is None and cn_xml.get("host") is not None: + endpoint_url = host_to_endpoint(cn_xml) + region = cn_xml.get("region") + cache_dict = parse_caching_config_dict_from_xml(config_xml) + + transfer_xml = config_xml.findall("transfer") + if not transfer_xml: + transfer_xml = {} + else: + transfer_xml = transfer_xml[0] + transfer_dict = {} + for prefix in ["", "upload_", "download_"]: + for key in [ + "multipart_threshold", + "max_concurrency", + "multipart_chunksize", + "num_download_attempts", + "max_io_queue", + "io_chunksize", + "use_threads", + "max_bandwidth", + ]: + full_key = f"{prefix}{key}" + value = transfer_xml.get(full_key) + if transfer_xml.get(full_key) is not None: + transfer_dict[full_key] = value + + tag, attrs = "extra_dir", ("type", "path") + extra_dirs = config_xml.findall(tag) + if not extra_dirs: + msg = f"No {tag} element in XML tree" + log.error(msg) + raise Exception(msg) + extra_dirs = [{k: e.get(k) for k in attrs} for e in extra_dirs] + + config_dict = { + "auth": { + "access_key": access_key, + "secret_key": secret_key, + }, + "bucket": { + "name": bucket_name, + }, + "connection": { + "endpoint_url": endpoint_url, + "region": region, + }, + "transfer": transfer_dict, + "cache": cache_dict, + "extra_dirs": extra_dirs, + "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml), + } + name = config_xml.attrib.get("name", None) + if name is not None: + config_dict["name"] = name + device = config_xml.attrib.get("device", None) + config_dict["device"] = device + return config_dict + except Exception: + # Toss it back up after logging, we can't continue loading at this point. + log.exception("Malformed ObjectStore Configuration XML -- unable to continue") + raise + + +class S3ClientConstructorKwds(TypedDict): + service_name: Literal["s3"] + endpoint_url: NotRequired[str] + region_name: NotRequired[str] + aws_access_key_id: NotRequired[str] + aws_secret_access_key: NotRequired[str] + + +class S3ObjectStore(CachingConcreteObjectStore): + """ + Object store that stores objects as items in an AWS S3 bucket. A local + cache exists that is used as an intermediate location for files between + Galaxy and S3. 
+ """ + + _client: "S3Client" + store_type = "boto3" + cloud = True + + def __init__(self, config, config_dict): + super().__init__(config, config_dict) + self.cache_monitor = None + + auth_dict = config_dict["auth"] + bucket_dict = config_dict["bucket"] + connection_dict = config_dict.get("connection", {}) + cache_dict = config_dict.get("cache") or {} + transfer_dict = config_dict.get("transfer", {}) + typed_transfer_dict = {} + for prefix in ["", "upload_", "download_"]: + options: Dict[str, Callable[[Any], Any]] = { + "multipart_threshold": int, + "max_concurrency": int, + "multipart_chunksize": int, + "num_download_attempts": int, + "max_io_queue": int, + "io_chunksize": int, + "use_threads": asbool, + "max_bandwidth": int, + } + for key, key_type in options.items(): + full_key = f"{prefix}{key}" + transfer_value = transfer_dict.get(full_key) + if transfer_value is not None: + typed_transfer_dict[full_key] = key_type(transfer_value) + self.transfer_dict = typed_transfer_dict + + self.enable_cache_monitor, self.cache_monitor_interval = enable_cache_monitor(config, config_dict) + + self.access_key = auth_dict.get("access_key") + self.secret_key = auth_dict.get("secret_key") + + self.bucket = bucket_dict.get("name") + + self.endpoint_url = connection_dict.get("endpoint_url") + if self.endpoint_url is None and "host" in connection_dict: + self.endpoint_url = host_to_endpoint(connection_dict) + + self.region = connection_dict.get("region") + + self.cache_size = cache_dict.get("size") or self.config.object_store_cache_size + self.staging_path = cache_dict.get("path") or self.config.object_store_cache_path + self.cache_updated_data = cache_dict.get("cache_updated_data", True) + + extra_dirs = {e["type"]: e["path"] for e in config_dict.get("extra_dirs", [])} + self.extra_dirs.update(extra_dirs) + + self._initialize() + + def _initialize(self): + if boto3 is None: + raise Exception(NO_BOTO_ERROR_MESSAGE) + + self._ensure_staging_path_writable() + self._configure_connection() + self._start_cache_monitor_if_needed() + + def _configure_connection(self): + log.debug("Configuring S3 Connection") + self._init_client() + if not self._bucket_exists: + self._create_bucket() + + # get_object_url only works on AWS if client is set, so if it wasn't + # fetch it and reset the client now. Skip this logic entirely for other + # non-AWS services by ensuring endpoint_url is not set. + if not self.endpoint_url and not self.region: + response = self._client.get_bucket_location( + Bucket=self.bucket, + ) + if "LocationConstraint" in response: + region = response["LocationConstraint"] + self.region = region + self._init_client() + + def _init_client(self): + # set _client based on current args. 
+ # If access_key is empty use default credential chain + kwds: S3ClientConstructorKwds = { + "service_name": "s3", + } + if self.endpoint_url: + kwds["endpoint_url"] = self.endpoint_url + if self.region: + kwds["region_name"] = self.region + if self.access_key: + kwds["aws_access_key_id"] = self.access_key + kwds["aws_secret_access_key"] = self.secret_key + self._client = boto3.client(**kwds) + + @property + def _bucket_exists(self) -> bool: + try: + self._client.head_bucket(Bucket=self.bucket) + return True + except ClientError as err: + if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404: + return False + raise + + def _create_bucket(self): + kwds = {} + if self.region: + kwds["CreateBucketConfiguration"] = dict(LocationConstraint=self.region) + self._client.create_bucket(Bucket=self.bucket, **kwds) + + @classmethod + def parse_xml(clazz, config_xml): + return parse_config_xml(config_xml) + + def _config_to_dict(self): + return { + "auth": { + "access_key": self.access_key, + "secret_key": self.secret_key, + }, + "bucket": { + "name": self.bucket, + }, + "connection": { + "endpoint_url": self.endpoint_url, + "region": self.region, + }, + "transfer": self.transfer_dict, + "cache": { + "size": self.cache_size, + "path": self.staging_path, + "cache_updated_data": self.cache_updated_data, + }, + } + + def to_dict(self): + as_dict = super().to_dict() + as_dict.update(self._config_to_dict()) + return as_dict + + def _get_remote_size(self, rel_path) -> int: + response = self._client.head_object(Bucket=self.bucket, Key=rel_path) + return response["ContentLength"] + + def _exists_remotely(self, rel_path: str) -> bool: + try: + is_dir = rel_path[-1] == "/" + if is_dir: + for _ in self._keys(rel_path): + return True + + return False + else: + self._client.head_object(Bucket=self.bucket, Key=rel_path) + return True + except ClientError as e: + if e.response["Error"]["Code"] == "404": + return False + raise + + def _download(self, rel_path: str) -> bool: + local_destination = self._get_cache_path(rel_path) + try: + log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination) + if not self._caching_allowed(rel_path): + return False + config = self._transfer_config("download") + self._client.download_file(self.bucket, rel_path, local_destination, Config=config) + return True + except ClientError: + log.exception("Failed to download file from S3") + return False + + def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: + try: + self._client.put_object(Body=from_string.encode("utf-8"), Bucket=self.bucket, Key=rel_path) + return True + except ClientError: + log.exception("Trouble pushing to S3 '%s' from string", rel_path) + return False + + def _push_file_to_path(self, rel_path: str, source_file: str) -> bool: + try: + config = self._transfer_config("upload") + self._client.upload_file(source_file, self.bucket, rel_path, Config=config) + return True + except ClientError: + log.exception("Trouble pushing to S3 '%s' from file '%s'", rel_path, source_file) + return False + + def _delete_remote_all(self, rel_path: str) -> bool: + try: + for key in self._keys(rel_path): + self._client.delete_object(Bucket=self.bucket, Key=key) + return True + except ClientError: + log.exception("Could not delete blob '%s' from S3", rel_path) + return False + + def _delete_existing_remote(self, rel_path: str) -> bool: + try: + self._client.delete_object(Bucket=self.bucket, Key=rel_path) + return True + except ClientError: + log.exception("Could not delete blob '%s' from S3", rel_path) 
+ return False + + # https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 + def _keys(self, prefix="/", delimiter="/", start_after=""): + s3_paginator = self._client.get_paginator("list_objects_v2") + prefix = prefix.lstrip(delimiter) + start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after + for page in s3_paginator.paginate(Bucket=self.bucket, Prefix=prefix, StartAfter=start_after): + for content in page.get("Contents", ()): + yield content["Key"] + + def _download_directory_into_cache(self, rel_path, cache_path): + for key in self._keys(rel_path): + local_file_path = os.path.join(cache_path, os.path.relpath(key, rel_path)) + + # Create directories if they don't exist + os.makedirs(os.path.dirname(local_file_path), exist_ok=True) + + # Download the file + self._client.download_file(self.bucket, key, local_file_path) + + def _get_object_url(self, obj, **kwargs): + try: + if self._exists(obj, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + url = self._client.generate_presigned_url( + ClientMethod="get_object", + Params={ + "Bucket": self.bucket, + "Key": rel_path, + }, + ExpiresIn=3600, + HttpMethod="GET", + ) + return url + except ClientError: + log.exception("Failed to generate URL for dataset.") + return None + + def _get_store_usage_percent(self, obj): + return 0.0 + + def _transfer_config(self, prefix: Literal["upload", "download"]) -> "TransferConfig": + config = {} + for key in [ + "multipart_threshold", + "max_concurrency", + "multipart_chunksize", + "num_download_attempts", + "max_io_queue", + "io_chunksize", + "use_threads", + "max_bandwidth", + ]: + specific_key = f"{prefix}_{key}" + if specific_key in self.transfer_dict: + config[key] = self.transfer_dict[specific_key] + elif key in self.transfer_dict: + config[key] = self.transfer_dict[key] + return TransferConfig(**config) + + def shutdown(self): + self._shutdown_cache_monitor() diff --git a/lib/galaxy/objectstore/unittest_utils/__init__.py b/lib/galaxy/objectstore/unittest_utils/__init__.py index 8807159b2437..158400b7aea9 100644 --- a/lib/galaxy/objectstore/unittest_utils/__init__.py +++ b/lib/galaxy/objectstore/unittest_utils/__init__.py @@ -1,6 +1,7 @@ """Utilities for configuring and using objectstores in unit tests.""" import os +import random from io import StringIO from shutil import rmtree from string import Template @@ -32,10 +33,22 @@ class Config: - def __init__(self, config_str=DISK_TEST_CONFIG, clazz=None, store_by="id", template_vars=None): + def __init__( + self, + config_str=DISK_TEST_CONFIG, + clazz=None, + store_by="id", + template_vars=None, + inject_galaxy_test_env=False, + ): self.temp_directory = mkdtemp() - template_vars = template_vars or {} + template_vars = {} template_vars["temp_directory"] = self.temp_directory + if inject_galaxy_test_env: + template_vars["test_random_int"] = random.randint(100000, 999999) + for key, value in os.environ.items(): + if key.startswith("GALAXY_TEST_"): + template_vars[key] = value self.template_vars = template_vars if config_str.startswith("<"): config_file = "store.xml" diff --git a/packages/objectstore/test-requirements.txt b/packages/objectstore/test-requirements.txt index e079f8a6038d..8077db870b39 100644 --- a/packages/objectstore/test-requirements.txt +++ b/packages/objectstore/test-requirements.txt @@ -1 +1,3 @@ pytest +boto3 +azure-storage-blob diff --git a/pyproject.toml b/pyproject.toml index 4799ac3ebc4c..95e88cf337bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -130,6 +130,7 
@@ s3fs = "*" ase = ">=3.18.1" axe-selenium-python = "*" black = "*" +boto3 = "*" codespell = "*" cwltest = "*" darker = "*" @@ -183,6 +184,7 @@ types-python-dateutil = "*" types-PyYAML = "*" types-requests = "*" types-six = "*" +"boto3-stubs[s3]" = "*" [tool.ruff] target-version = "py38" diff --git a/test/integration/objectstore/test_objectstore_datatype_upload.py b/test/integration/objectstore/test_objectstore_datatype_upload.py index 3a17174bd66c..a383e43edaa1 100644 --- a/test/integration/objectstore/test_objectstore_datatype_upload.py +++ b/test/integration/objectstore/test_objectstore_datatype_upload.py @@ -234,7 +234,7 @@ def test_upload_datatype_irods_idle_connections( # Verify the connection pool has 0 active and 1 idle connections assert len(connection_pool.active) == 0 - assert len(connection_pool.idle) == 1 + assert len(connection_pool.idle) in [1, 2] # Wait for the idle connection to turn stale time.sleep(REFRESH_TIME) diff --git a/test/unit/objectstore/test_objectstore.py b/test/unit/objectstore/test_objectstore.py index 44564acbc2be..6fe31f79bb44 100644 --- a/test/unit/objectstore/test_objectstore.py +++ b/test/unit/objectstore/test_objectstore.py @@ -1,5 +1,7 @@ import os +import shutil import time +from functools import wraps from tempfile import ( mkdtemp, mkstemp, @@ -11,6 +13,7 @@ from requests import get from galaxy.exceptions import ObjectInvalid +from galaxy.objectstore import persist_extra_files_for_dataset from galaxy.objectstore.azure_blob import AzureBlobObjectStore from galaxy.objectstore.caching import ( CacheTarget, @@ -19,8 +22,10 @@ reset_cache, ) from galaxy.objectstore.cloud import Cloud +from galaxy.objectstore.examples import get_example from galaxy.objectstore.pithos import PithosObjectStore from galaxy.objectstore.s3 import S3ObjectStore +from galaxy.objectstore.s3_boto3 import S3ObjectStore as Boto3ObjectStore from galaxy.objectstore.unittest_utils import ( Config as TestConfig, DISK_TEST_CONFIG, @@ -46,6 +51,11 @@ def _initialize(self): pass +class UninitializedBoto3ObjectStore(Boto3ObjectStore): + def _initialize(self): + pass + + class UninitializedAzureBlobObjectStore(AzureBlobObjectStore): def _initialize(self): pass @@ -56,6 +66,20 @@ def _initialize(self): pass +def patch_object_stores_to_skip_initialize(f): + + @wraps(f) + @patch("galaxy.objectstore.s3.S3ObjectStore", UninitializedS3ObjectStore) + @patch("galaxy.objectstore.s3_boto3.S3ObjectStore", UninitializedBoto3ObjectStore) + @patch("galaxy.objectstore.pithos.PithosObjectStore", UninitializedPithosObjectStore) + @patch("galaxy.objectstore.cloud.Cloud", UninitializedCloudObjectStore) + @patch("galaxy.objectstore.azure_blob.AzureBlobObjectStore", UninitializedAzureBlobObjectStore) + def wrapper(*args, **kwd): + f(*args, **kwd) + + return wrapper + + def test_unlink_path(): with pytest.raises(FileNotFoundError): unlink(uuid4().hex) @@ -236,61 +260,8 @@ def test_disk_store_alt_name_abspath(): pass -HIERARCHICAL_TEST_CONFIG = """ - - - - - This is our new storage cluster, check out the storage - on our institute's system page for [Fancy New Storage](http://computecenter.example.com/systems/fancystorage). - - - - - - - - This is our older legacy storage cluster, check out the storage - on our institute's system page for [Legacy Storage](http://computecenter.example.com/systems/legacystorage). 
- - - - - - - -""" - -HIERARCHICAL_TEST_CONFIG_YAML = """ -type: hierarchical -backends: - - id: files1 - name: Newer Cool Storage - description: | - This is our new storage cluster, check out the storage - on our institute's system page for [Fancy New Storage](http://computecenter.example.com/systems/fancystorage). - type: disk - weight: 1 - files_dir: "${temp_directory}/files1" - extra_dirs: - - type: temp - path: "${temp_directory}/tmp1" - - type: job_work - path: "${temp_directory}/job_working_directory1" - - id: files2 - name: Older Legacy Storage - description: | - This is our older legacy storage cluster, check out the storage - on our institute's system page for [Legacy Storage](http://computecenter.example.com/systems/legacystorage). - type: disk - weight: 1 - files_dir: "${temp_directory}/files2" - extra_dirs: - - type: temp - path: "${temp_directory}/tmp2" - - type: job_work - path: "${temp_directory}/job_working_directory2" -""" +HIERARCHICAL_TEST_CONFIG = get_example("hierarchical_simple.xml") +HIERARCHICAL_TEST_CONFIG_YAML = get_example("hierarchical_simple.yml") def test_hierarchical_store(): @@ -392,7 +363,6 @@ def test_mixed_private(): # Distributed object store can combine private and non-private concrete objectstores with TestConfig(MIXED_STORE_BY_DISTRIBUTED_TEST_CONFIG) as (directory, object_store): ids = object_store.object_store_ids() - print(ids) assert len(ids) == 2 ids = object_store.object_store_ids(private=True) @@ -424,40 +394,8 @@ def test_empty_cache_targets_for_disk_nested_stores(): assert len(object_store.cache_targets()) == 0 -BADGES_TEST_1_CONFIG_XML = """ - - - - - - - Fast interconnects. - - - Storage is backed up to tape nightly. - - -""" - - -BADGES_TEST_1_CONFIG_YAML = """ -type: disk -files_dir: "${temp_directory}/files1" -store_by: uuid -extra_dirs: - - type: temp - path: "${temp_directory}/tmp1" - - type: job_work - path: "${temp_directory}/job_working_directory1" -badges: - - type: short_term - - type: faster - message: Fast interconnects. - - type: less_stable - - type: more_secure - - type: backed_up - message: Storage is backed up to tape nightly. 
-""" +BADGES_TEST_1_CONFIG_XML = get_example("disk_badges.xml") +BADGES_TEST_1_CONFIG_YAML = get_example("disk_badges.yml") def test_badges_parsing(): @@ -524,54 +462,8 @@ def test_badges_parsing_conflicts(): assert exception_raised -DISTRIBUTED_TEST_CONFIG = """ - - - - - - - - - - - - - - - - -""" - - -DISTRIBUTED_TEST_CONFIG_YAML = """ -type: distributed -backends: - - id: files1 - quota: - source: 1files - type: disk - weight: 2 - device: primary_disk - files_dir: "${temp_directory}/files1" - extra_dirs: - - type: temp - path: "${temp_directory}/tmp1" - - type: job_work - path: "${temp_directory}/job_working_directory1" - - id: files2 - quota: - source: 2files - type: disk - weight: 1 - device: primary_disk - files_dir: "${temp_directory}/files2" - extra_dirs: - - type: temp - path: "${temp_directory}/tmp2" - - type: job_work - path: "${temp_directory}/job_working_directory2" -""" +DISTRIBUTED_TEST_CONFIG = get_example("distributed_disk.xml") +DISTRIBUTED_TEST_CONFIG_YAML = get_example("distributed_disk.yml") def test_distributed_store(): @@ -605,7 +497,6 @@ def test_distributed_store(): device_source_map = object_store.get_device_source_map() assert device_source_map - print(device_source_map.backends) assert device_source_map.get_device_id("files1") == "primary_disk" assert device_source_map.get_device_id("files2") == "primary_disk" @@ -616,48 +507,10 @@ def test_distributed_store_empty_cache_targets(): assert len(object_store.cache_targets()) == 0 -DISTRIBUTED_TEST_S3_CONFIG_YAML = """ -type: distributed -backends: - - id: files1 - weight: 1 - type: s3 - auth: - access_key: access_moo - secret_key: secret_cow - - bucket: - name: unique_bucket_name_all_lowercase - use_reduced_redundancy: false - - extra_dirs: - - type: job_work - path: ${temp_directory}/job_working_directory_s3 - - type: temp - path: ${temp_directory}/tmp_s3 - - id: files2 - weight: 1 - type: s3 - auth: - access_key: access_moo - secret_key: secret_cow - - bucket: - name: unique_bucket_name_all_lowercase_2 - use_reduced_redundancy: false - - extra_dirs: - - type: job_work - path: ${temp_directory}/job_working_directory_s3_2 - - type: temp - path: ${temp_directory}/tmp_s3_2 -""" - - -@patch("galaxy.objectstore.s3.S3ObjectStore", UninitializedS3ObjectStore) +@patch_object_stores_to_skip_initialize def test_distributed_store_with_cache_targets(): - for config_str in [DISTRIBUTED_TEST_S3_CONFIG_YAML]: - with TestConfig(config_str) as (directory, object_store): + for config_str in [get_example("distributed_s3.yml")]: + with TestConfig(config_str) as (_, object_store): assert len(object_store.cache_targets()) == 2 @@ -691,37 +544,14 @@ def test_hiercachical_backend_must_share_quota_source(): assert the_exception is not None -PITHOS_TEST_CONFIG = """ - - - - - - -""" - - -PITHOS_TEST_CONFIG_YAML = """ -type: pithos -auth: - url: http://example.org/ - token: extoken123 - -container: - name: foo - project: cow - -extra_dirs: - - type: temp - path: database/tmp_pithos - - type: job_work - path: database/working_pithos -""" +PITHOS_TEST_CONFIG = get_example("pithos_simple.xml") +PITHOS_TEST_CONFIG_YAML = get_example("pithos_simple.yml") +@patch_object_stores_to_skip_initialize def test_config_parse_pithos(): for config_str in [PITHOS_TEST_CONFIG, PITHOS_TEST_CONFIG_YAML]: - with TestConfig(config_str, clazz=UninitializedPithosObjectStore) as (directory, object_store): + with TestConfig(config_str) as (directory, object_store): configured_config_dict = object_store.config_dict _assert_has_keys(configured_config_dict, ["auth", 
"container", "extra_dirs"]) @@ -755,42 +585,14 @@ def test_config_parse_pithos(): assert len(extra_dirs) == 2 -S3_TEST_CONFIG = """ - - - - - - -""" - - -S3_TEST_CONFIG_YAML = """ -type: s3 -private: true -auth: - access_key: access_moo - secret_key: secret_cow - -bucket: - name: unique_bucket_name_all_lowercase - use_reduced_redundancy: false - -cache: - path: database/object_store_cache - size: 1000 - -extra_dirs: -- type: job_work - path: database/job_working_directory_s3 -- type: temp - path: database/tmp_s3 -""" +S3_TEST_CONFIG = get_example("s3_simple.xml") +S3_TEST_CONFIG_YAML = get_example("s3_simple.yml") +@patch_object_stores_to_skip_initialize def test_config_parse_s3(): for config_str in [S3_TEST_CONFIG, S3_TEST_CONFIG_YAML]: - with TestConfig(config_str, clazz=UninitializedS3ObjectStore) as (directory, object_store): + with TestConfig(config_str) as (directory, object_store): assert object_store.private assert object_store.access_key == "access_moo" assert object_store.secret_key == "secret_cow" @@ -838,140 +640,134 @@ def test_config_parse_s3(): assert len(extra_dirs) == 2 -S3_DEFAULT_CACHE_TEST_CONFIG = """ - - - - - -""" - - -S3_DEFAULT_CACHE_TEST_CONFIG_YAML = """ -type: s3 -private: true -auth: - access_key: access_moo - secret_key: secret_cow - -bucket: - name: unique_bucket_name_all_lowercase - use_reduced_redundancy: false - -extra_dirs: -- type: job_work - path: database/job_working_directory_s3 -- type: temp - path: database/tmp_s3 -""" +S3_DEFAULT_CACHE_TEST_CONFIG = get_example("s3_global_cache.xml") +S3_DEFAULT_CACHE_TEST_CONFIG_YAML = get_example("s3_global_cache.yml") +@patch_object_stores_to_skip_initialize def test_config_parse_s3_with_default_cache(): for config_str in [S3_DEFAULT_CACHE_TEST_CONFIG, S3_DEFAULT_CACHE_TEST_CONFIG_YAML]: - with TestConfig(config_str, clazz=UninitializedS3ObjectStore) as (directory, object_store): + with TestConfig(config_str) as (directory, object_store): assert object_store.cache_size == -1 assert object_store.staging_path == directory.global_config.object_store_cache_path -CLOUD_AWS_TEST_CONFIG = """ - - - - - - -""" - - -CLOUD_AWS_TEST_CONFIG_YAML = """ -type: cloud -provider: aws -auth: - access_key: access_moo - secret_key: secret_cow - -bucket: - name: unique_bucket_name_all_lowercase - use_reduced_redundancy: false - -cache: - path: database/object_store_cache - size: 1000 - -extra_dirs: -- type: job_work - path: database/job_working_directory_cloud -- type: temp - path: database/tmp_cloud -""" +@patch_object_stores_to_skip_initialize +def test_config_parse_boto3(): + for config_str in [get_example("boto3_simple.xml"), get_example("boto3_simple.yml")]: + with TestConfig(config_str) as (directory, object_store): + assert object_store.access_key == "access_moo" + assert object_store.secret_key == "secret_cow" + assert object_store.bucket == "unique_bucket_name_all_lowercase" -CLOUD_AZURE_TEST_CONFIG = """ - - - - - - -""" + # defaults to AWS + assert object_store.endpoint_url is None -CLOUD_AZURE_TEST_CONFIG_YAML = """ -type: cloud -provider: azure -auth: - subscription_id: a_sub_id - client_id: and_a_client_id - secret: and_a_secret_key - tenant: and_some_tenant_info + cache_target = object_store.cache_target + assert cache_target.size == 1000 + assert cache_target.path == "database/object_store_cache" + assert object_store.extra_dirs["job_work"] == "database/job_working_directory_s3" + assert object_store.extra_dirs["temp"] == "database/tmp_s3" -bucket: - name: unique_bucket_name_all_lowercase - use_reduced_redundancy: 
false + as_dict = object_store.to_dict() + _assert_has_keys(as_dict, ["auth", "bucket", "connection", "cache", "extra_dirs", "type"]) -cache: - path: database/object_store_cache - size: 1000 + _assert_key_has_value(as_dict, "type", "boto3") -extra_dirs: -- type: job_work - path: database/job_working_directory_cloud -- type: temp - path: database/tmp_cloud -""" + auth_dict = as_dict["auth"] + bucket_dict = as_dict["bucket"] + cache_dict = as_dict["cache"] + _assert_key_has_value(auth_dict, "access_key", "access_moo") + _assert_key_has_value(auth_dict, "secret_key", "secret_cow") -CLOUD_GOOGLE_TEST_CONFIG = """ - - - - - - -""" + _assert_key_has_value(bucket_dict, "name", "unique_bucket_name_all_lowercase") -CLOUD_GOOGLE_TEST_CONFIG_YAML = """ -type: cloud -provider: google -auth: - credentials_file: gcp.config + _assert_key_has_value(cache_dict, "size", 1000) + _assert_key_has_value(cache_dict, "path", "database/object_store_cache") -bucket: - name: unique_bucket_name_all_lowercase - use_reduced_redundancy: false + extra_dirs = as_dict["extra_dirs"] + assert len(extra_dirs) == 2 -cache: - path: database/object_store_cache - size: 1000 -extra_dirs: -- type: job_work - path: database/job_working_directory_cloud -- type: temp - path: database/tmp_cloud -""" +@patch_object_stores_to_skip_initialize +def test_config_parse_boto3_custom_connection(): + for config_str in [get_example("boto3_custom_connection.xml"), get_example("boto3_custom_connection.yml")]: + with TestConfig(config_str) as (directory, object_store): + assert object_store.endpoint_url == "https://s3.example.org/" + assert object_store.region == "the_example_region" +@patch_object_stores_to_skip_initialize +def test_config_parse_boto3_merged_transfer_options(): + for config_str in [ + get_example("boto3_merged_transfer_options.xml"), + get_example("boto3_merged_transfer_options.yml"), + ]: + with TestConfig(config_str) as (directory, object_store): + as_dict = object_store.to_dict() + transfer_dict = as_dict["transfer"] + assert transfer_dict["multipart_threshold"] == 13 + assert transfer_dict["max_concurrency"] == 13 + assert transfer_dict["multipart_chunksize"] == 13 + assert transfer_dict["num_download_attempts"] == 13 + assert transfer_dict["max_io_queue"] == 13 + assert transfer_dict["io_chunksize"] == 13 + assert transfer_dict["use_threads"] is False + assert transfer_dict["max_bandwidth"] == 13 + + for transfer_type in ["upload", "download"]: + transfer_config = object_store._transfer_config(transfer_type) + assert transfer_config.multipart_threshold == 13 + assert transfer_config.max_concurrency == 13 + assert transfer_config.multipart_chunksize == 13 + assert transfer_config.num_download_attempts == 13 + assert transfer_config.max_io_queue == 13 + assert transfer_config.io_chunksize == 13 + assert transfer_config.use_threads is False + assert transfer_config.max_bandwidth == 13 + + +@patch_object_stores_to_skip_initialize +def test_config_parse_boto3_separated_transfer_options(): + for config_str in [ + get_example("boto3_separated_transfer_options.xml"), + get_example("boto3_separated_transfer_options.yml"), + ]: + with TestConfig(config_str) as (directory, object_store): + transfer_config = object_store._transfer_config("upload") + assert transfer_config.multipart_threshold == 13 + assert transfer_config.max_concurrency == 13 + assert transfer_config.multipart_chunksize == 13 + assert transfer_config.num_download_attempts == 13 + assert transfer_config.max_io_queue == 13 + assert transfer_config.io_chunksize == 13 + 
assert transfer_config.use_threads is False + assert transfer_config.max_bandwidth == 13 + + transfer_config = object_store._transfer_config("download") + assert transfer_config.multipart_threshold == 14 + assert transfer_config.max_concurrency == 14 + assert transfer_config.multipart_chunksize == 14 + assert transfer_config.num_download_attempts == 14 + assert transfer_config.max_io_queue == 14 + assert transfer_config.io_chunksize == 14 + assert transfer_config.use_threads is True + assert transfer_config.max_bandwidth == 14 + + +CLOUD_AWS_TEST_CONFIG = get_example("cloud_aws_simple.xml") +CLOUD_AWS_TEST_CONFIG_YAML = get_example("cloud_aws_simple.yml") + +CLOUD_AZURE_TEST_CONFIG = get_example("cloud_azure_simple.xml") +CLOUD_AZURE_TEST_CONFIG_YAML = get_example("cloud_azure_simple.yml") + +CLOUD_GOOGLE_TEST_CONFIG = get_example("cloud_gcp_simple.xml") +CLOUD_GOOGLE_TEST_CONFIG_YAML = get_example("cloud_gcp_simple.yml") + + +@patch_object_stores_to_skip_initialize def test_config_parse_cloud(): for config_str in [ CLOUD_AWS_TEST_CONFIG, @@ -988,7 +784,7 @@ def test_config_parse_cloud(): path = os.path.join(tmpdir, "gcp.config") open(path, "w").write("some_gcp_config") config_str = config_str.replace("gcp.config", path) - with TestConfig(config_str, clazz=UninitializedCloudObjectStore) as (directory, object_store): + with TestConfig(config_str) as (directory, object_store): assert object_store.bucket_name == "unique_bucket_name_all_lowercase" assert object_store.use_rr is False @@ -1029,19 +825,13 @@ def test_config_parse_cloud(): assert len(extra_dirs) == 2 -CLOUD_AWS_NO_AUTH_TEST_CONFIG = """ - - - - - - -""" +CLOUD_AWS_NO_AUTH_TEST_CONFIG = get_example("cloud_aws_no_auth.xml") +@patch_object_stores_to_skip_initialize def test_config_parse_cloud_noauth_for_aws(): for config_str in [CLOUD_AWS_NO_AUTH_TEST_CONFIG]: - with TestConfig(config_str, clazz=UninitializedCloudObjectStore) as (directory, object_store): + with TestConfig(config_str) as (directory, object_store): assert object_store.bucket_name == "unique_bucket_name_all_lowercase" assert object_store.use_rr is False @@ -1062,7 +852,6 @@ def test_config_parse_cloud_noauth_for_aws(): provider = as_dict["provider"] assert provider == "aws" - print(auth_dict["access_key"]) _assert_key_has_value(auth_dict, "access_key", None) _assert_key_has_value(auth_dict, "secret_key", None) @@ -1076,62 +865,29 @@ def test_config_parse_cloud_noauth_for_aws(): assert len(extra_dirs) == 2 -CLOUD_AWS_NO_CACHE_TEST_CONFIG = """ - - - - - -""" +CLOUD_AWS_NO_CACHE_TEST_CONFIG = get_example("cloud_aws_default_cache.xml") +@patch_object_stores_to_skip_initialize def test_config_parse_cloud_no_cache_for_aws(): for config_str in [CLOUD_AWS_NO_CACHE_TEST_CONFIG]: - with TestConfig(config_str, clazz=UninitializedCloudObjectStore) as (directory, object_store): + with TestConfig(config_str) as (directory, object_store): assert object_store.staging_path == directory.global_config.object_store_cache_path assert object_store.cache_size == -1 -AZURE_BLOB_TEST_CONFIG = """ - - - - - - -""" - - -AZURE_BLOB_TEST_CONFIG_YAML = """ -type: azure_blob -auth: - account_name: azureact - account_key: password123 - -container: - name: unique_container_name - max_chunk_size: 250 - -cache: - path: database/object_store_cache - size: 100 - -extra_dirs: -- type: job_work - path: database/job_working_directory_azure -- type: temp - path: database/tmp_azure -""" +AZURE_BLOB_TEST_CONFIG = get_example("azure_simple.xml") +AZURE_BLOB_TEST_CONFIG_YAML = get_example("azure_simple.yml") 
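For reference, the boto3 transfer-option tests above pin down the lookup order used by _transfer_config: an upload_- or download_-prefixed option overrides the unprefixed one, and anything left unset falls back to boto3's TransferConfig defaults. A standalone restatement of that resolution with made-up values, illustrative only:

from boto3.s3.transfer import TransferConfig

# Hypothetical parsed options; unprefixed keys apply to both directions.
transfer_dict = {"max_concurrency": 13, "download_max_concurrency": 14}


def resolve_transfer_config(prefix: str) -> TransferConfig:
    keys = [
        "multipart_threshold", "max_concurrency", "multipart_chunksize",
        "num_download_attempts", "max_io_queue", "io_chunksize",
        "use_threads", "max_bandwidth",
    ]
    config = {}
    for key in keys:
        # prefixed option wins over the shared one
        if f"{prefix}_{key}" in transfer_dict:
            config[key] = transfer_dict[f"{prefix}_{key}"]
        elif key in transfer_dict:
            config[key] = transfer_dict[key]
    return TransferConfig(**config)


assert resolve_transfer_config("upload").max_concurrency == 13
assert resolve_transfer_config("download").max_concurrency == 14
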
+@patch_object_stores_to_skip_initialize def test_config_parse_azure(): for config_str in [AZURE_BLOB_TEST_CONFIG, AZURE_BLOB_TEST_CONFIG_YAML]: - with TestConfig(config_str, clazz=UninitializedAzureBlobObjectStore) as (directory, object_store): + with TestConfig(config_str) as (directory, object_store): assert object_store.account_name == "azureact" assert object_store.account_key == "password123" assert object_store.container_name == "unique_container_name" - assert object_store.max_chunk_size == 250 cache_target = object_store.cache_target assert cache_target.size == 100 @@ -1152,7 +908,6 @@ def test_config_parse_azure(): _assert_key_has_value(auth_dict, "account_key", "password123") _assert_key_has_value(container_dict, "name", "unique_container_name") - _assert_key_has_value(container_dict, "max_chunk_size", 250) _assert_key_has_value(cache_dict, "size", 100) _assert_key_has_value(cache_dict, "path", "database/object_store_cache") @@ -1161,6 +916,18 @@ def test_config_parse_azure(): assert len(extra_dirs) == 2 +@patch_object_stores_to_skip_initialize +def test_config_parse_azure_transfer(): + for config_str in [get_example("azure_transfer.xml"), get_example("azure_transfer.yml")]: + with TestConfig(config_str) as (directory, object_store): + as_dict = object_store.to_dict()["transfer"] + assert as_dict["download_max_concurrency"] == 1 + assert as_dict["upload_max_concurrency"] == 2 + assert as_dict["max_single_put_size"] == 10 + assert as_dict["max_single_get_size"] == 20 + assert as_dict["max_block_size"] == 3 + + def test_cache_monitor_thread(tmp_path): cache_dir = tmp_path path = cache_dir / "a_file_0" @@ -1209,36 +976,14 @@ def test_fits_in_cache_check(tmp_path): assert noop_cache_target.fits_in_cache(1024 * 1024 * 1024 * 100) -AZURE_BLOB_NO_CACHE_TEST_CONFIG = """ - - - - - -""" - - -AZURE_BLOB_NO_CACHE_TEST_CONFIG_YAML = """ -type: azure_blob -auth: - account_name: azureact - account_key: password123 - -container: - name: unique_container_name - max_chunk_size: 250 - -extra_dirs: -- type: job_work - path: database/job_working_directory_azure -- type: temp - path: database/tmp_azure -""" +AZURE_BLOB_NO_CACHE_TEST_CONFIG = get_example("azure_default_cache.xml") +AZURE_BLOB_NO_CACHE_TEST_CONFIG_YAML = get_example("azure_default_cache.yml") +@patch_object_stores_to_skip_initialize def test_config_parse_azure_no_cache(): for config_str in [AZURE_BLOB_NO_CACHE_TEST_CONFIG, AZURE_BLOB_NO_CACHE_TEST_CONFIG_YAML]: - with TestConfig(config_str, clazz=UninitializedAzureBlobObjectStore) as (directory, object_store): + with TestConfig(config_str) as (directory, object_store): assert object_store.cache_size == -1 assert object_store.staging_path == directory.global_config.object_store_cache_path @@ -1321,6 +1066,52 @@ def verify_caching_object_store_functionality(tmp_path, object_store, check_get_ reset_cache(object_store.cache_target) assert not object_store.exists(to_delete_dataset) + # Test bigger file to force multi-process. 
+    big_file_dataset = MockDataset(6)
+    size = 1024
+    path = tmp_path / "big_file.bytes"
+    with path.open("wb") as f:
+        f.write(os.urandom(size))
+    object_store.update_from_file(big_file_dataset, file_name=path, create=True)
+
+    extra_files_dataset = MockDataset(7)
+    object_store.create(extra_files_dataset)
+    extra = tmp_path / "extra"
+    extra.mkdir()
+    extra_file = extra / "new_value.txt"
+    extra_file.write_text("My new value")
+
+    persist_extra_files_for_dataset(
+        object_store,
+        extra,
+        extra_files_dataset,  # type: ignore[arg-type,unused-ignore]
+        extra_files_dataset._extra_files_rel_path,
+    )
+
+    # The following checks used to exhibit different behavior depending
+    # on how the cache was cleaned - removing the whole directory vs
+    # just cleaning up files the way Galaxy's internal caching works with
+    # reset_cache. So we test both here.
+
+    # hard reset
+    shutil.rmtree(object_store.cache_target.path)
+    os.makedirs(object_store.cache_target.path)
+
+    extra_path = _extra_file_path(object_store, extra_files_dataset)
+    assert os.path.exists(extra_path)
+    expected_extra_file = os.path.join(extra_path, "new_value.txt")
+    assert os.path.exists(expected_extra_file)
+    assert open(expected_extra_file).read() == "My new value"
+
+    # Redo the above test with Galaxy's reset_cache, which leaves empty directories
+    # around.
+    reset_cache(object_store.cache_target)
+    extra_path = _extra_file_path(object_store, extra_files_dataset)
+    assert os.path.exists(extra_path)
+    expected_extra_file = os.path.join(extra_path, "new_value.txt")
+    assert os.path.exists(expected_extra_file)
+    assert open(expected_extra_file).read() == "My new value"
+
     # Test get_object_url returns a read-only URL
     url = object_store.get_object_url(hello_world_dataset)
     if check_get_url:
@@ -1329,6 +1120,13 @@ def verify_caching_object_store_functionality(tmp_path, object_store, check_get_
         assert response.text == "Hello World!"


+def _extra_file_path(object_store, dataset):
+    # mimic the calls the model layer would make here...
+    if object_store.exists(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path):
+        return object_store.get_filename(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path)
+    return object_store.construct_path(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path, in_cache=True)
+
+
 def verify_object_store_functionality(tmp_path, object_store, check_get_url=True):
     # Test no dataset with id 1 exists.
     absent_dataset = MockDataset(1)
@@ -1382,238 +1180,115 @@ def verify_object_store_functionality(tmp_path, object_store, check_get_url=True
     assert response.text == "Hello World!"
-AZURE_BLOB_TEMPLATE_TEST_CONFIG_YAML = """ -type: azure_blob -store_by: uuid -auth: - account_name: ${account_name} - account_key: ${account_key} - -container: - name: ${container_name} - -extra_dirs: -- type: job_work - path: database/job_working_directory_azure -- type: temp - path: database/tmp_azure -""" +def integration_test_config(example_filename: str): + return TestConfig(get_example(example_filename), inject_galaxy_test_env=True) @skip_unless_environ("GALAXY_TEST_AZURE_CONTAINER_NAME") @skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_KEY") @skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_NAME") def test_real_azure_blob_store(tmp_path): - template_vars = { - "container_name": os.environ["GALAXY_TEST_AZURE_CONTAINER_NAME"], - "account_key": os.environ["GALAXY_TEST_AZURE_ACCOUNT_KEY"], - "account_name": os.environ["GALAXY_TEST_AZURE_ACCOUNT_NAME"], - } - with TestConfig(AZURE_BLOB_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (_, object_store): + with integration_test_config("azure_integration_test.yml") as (_, object_store): verify_caching_object_store_functionality(tmp_path, object_store) -AZURE_BLOB_TEMPLATE_WITH_ACCOUNT_URL_TEST_CONFIG_YAML = """ -type: azure_blob -store_by: uuid -auth: - account_name: ${account_name} - account_key: ${account_key} - account_url: ${account_url} - -container: - name: ${container_name} - -extra_dirs: -- type: job_work - path: database/job_working_directory_azure -- type: temp - path: database/tmp_azure -""" - - @skip_unless_environ("GALAXY_TEST_AZURE_CONTAINER_NAME") @skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_KEY") @skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_NAME") @skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_URL") def test_real_azure_blob_store_with_account_url(tmp_path): - template_vars = { - "container_name": os.environ["GALAXY_TEST_AZURE_CONTAINER_NAME"], - "account_key": os.environ["GALAXY_TEST_AZURE_ACCOUNT_KEY"], - "account_name": os.environ["GALAXY_TEST_AZURE_ACCOUNT_NAME"], - "account_url": os.environ["GALAXY_TEST_AZURE_ACCOUNT_URL"], - } - with TestConfig(AZURE_BLOB_TEMPLATE_WITH_ACCOUNT_URL_TEST_CONFIG_YAML, template_vars=template_vars) as ( + with integration_test_config("azure_integration_test_with_account_url.yml") as ( _, object_store, ): verify_caching_object_store_functionality(tmp_path, object_store) -AZURE_BLOB_IN_HIERARCHICAL_TEMPLATE_TEST_CONFIG_YAML = """ -type: distributed -backends: -- type: azure_blob - id: azure1 - store_by: uuid - name: Azure Store 1 - allow_selection: true - weight: 1 - auth: - account_name: ${account_name} - account_key: ${account_key} - - container: - name: ${container_name} - - extra_dirs: - - type: job_work - path: database/job_working_directory_azure_1 - - type: temp - path: database/tmp_azure_1 -- type: azure_blob - id: azure2 - store_by: uuid - name: Azure Store 2 - allow_selection: true - weight: 1 - auth: - account_name: ${account_name} - account_key: ${account_key} - - container: - name: ${container_name} - - extra_dirs: - - type: job_work - path: database/job_working_directory_azure_2 - - type: temp - path: database/tmp_azure_2 -""" - - @skip_unless_environ("GALAXY_TEST_AZURE_CONTAINER_NAME") @skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_KEY") @skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_NAME") def test_real_azure_blob_store_in_hierarchical(tmp_path): - template_vars = { - "container_name": os.environ["GALAXY_TEST_AZURE_CONTAINER_NAME"], - "account_key": os.environ["GALAXY_TEST_AZURE_ACCOUNT_KEY"], - "account_name": os.environ["GALAXY_TEST_AZURE_ACCOUNT_NAME"], - } 
- with TestConfig(AZURE_BLOB_IN_HIERARCHICAL_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as ( - _, - object_store, - ): + with integration_test_config("azure_integration_test_distributed.yml") as (_, object_store): verify_object_store_functionality(tmp_path, object_store) -AMAZON_S3_SIMPLE_TEMPLATE_TEST_CONFIG_YAML = """ -type: aws_s3 -store_by: uuid -auth: - access_key: ${access_key} - secret_key: ${secret_key} - -bucket: - name: ${bucket} - -connection: - region: ${region} - -extra_dirs: -- type: job_work - path: database/job_working_directory_azure -- type: temp - path: database/tmp_azure -""" - - @skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY") @skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY") @skip_unless_environ("GALAXY_TEST_AWS_BUCKET") @skip_unless_environ("GALAXY_TEST_AWS_REGION") def test_real_aws_s3_store(tmp_path): - template_vars = { - "access_key": os.environ["GALAXY_TEST_AWS_ACCESS_KEY"], - "secret_key": os.environ["GALAXY_TEST_AWS_SECRET_KEY"], - "bucket": os.environ["GALAXY_TEST_AWS_BUCKET"], - "region": os.environ["GALAXY_TEST_AWS_REGION"], - } - with TestConfig(AMAZON_S3_SIMPLE_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (_, object_store): + with integration_test_config("aws_s3_integration_test.yml") as (_, object_store): verify_caching_object_store_functionality(tmp_path, object_store) -AMAZON_CLOUDBRIDGE_TEMPLATE_TEST_CONFIG_YAML = """ -type: cloud -store_by: uuid -provider: aws -auth: - access_key: ${access_key} - secret_key: ${secret_key} +@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY") +@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY") +@skip_unless_environ("GALAXY_TEST_AWS_BUCKET") +def test_real_aws_s3_store_boto3(tmp_path): + with integration_test_config("boto3_integration_test_aws.yml") as (_, object_store): + verify_caching_object_store_functionality(tmp_path, object_store) -bucket: - name: ${bucket} -extra_dirs: -- type: job_work - path: database/job_working_directory_azure -- type: temp - path: database/tmp_azure -""" +@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY") +@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY") +@skip_unless_environ("GALAXY_TEST_AWS_BUCKET") +def test_real_aws_s3_store_boto3_multipart(tmp_path): + with integration_test_config("boto3_integration_test_multithreaded.yml") as (_, object_store): + verify_caching_object_store_functionality(tmp_path, object_store) +@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY") +@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY") +def test_real_aws_s3_store_boto3_new_bucket(tmp_path): + with integration_test_config("boto3_integration_test_aws_new_bucket.yml") as (_, object_store): + verify_caching_object_store_functionality(tmp_path, object_store) + + +# this test fails if you have axel installed because axel requires URLs to work and that requires +# setting a region with the cloudbridge store. 
 @skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY")
 @skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY")
 @skip_unless_environ("GALAXY_TEST_AWS_BUCKET")
 def test_aws_via_cloudbridge_store(tmp_path):
-    template_vars = {
-        "access_key": os.environ["GALAXY_TEST_AWS_ACCESS_KEY"],
-        "secret_key": os.environ["GALAXY_TEST_AWS_SECRET_KEY"],
-        "bucket": os.environ["GALAXY_TEST_AWS_BUCKET"],
-    }
-    with TestConfig(AMAZON_CLOUDBRIDGE_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (_, object_store):
+    with integration_test_config("cloud_integration_test_aws.yml") as (_, object_store):
         # disabling get_object_url check - cloudbridge in this config assumes the region
         # is us-east-1 and generates a URL for that region. This functionality works and can
         # be tested if a region is specified in the configuration (see next config and test case).
         verify_caching_object_store_functionality(tmp_path, object_store, check_get_url=False)


-AMAZON_CLOUDBRIDGE_WITH_REGION_TEMPLATE_TEST_CONFIG_YAML = """
-type: cloud
-store_by: uuid
-provider: aws
-auth:
-  access_key: ${access_key}
-  secret_key: ${secret_key}
-  region: ${region}
-
-bucket:
-  name: ${bucket}
-
-extra_dirs:
-- type: job_work
-  path: database/job_working_directory_azure
-- type: temp
-  path: database/tmp_azure
-"""
-
-
 @skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY")
 @skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY")
 @skip_unless_environ("GALAXY_TEST_AWS_BUCKET")
 @skip_unless_environ("GALAXY_TEST_AWS_REGION")
 def test_aws_via_cloudbridge_store_with_region(tmp_path):
-    template_vars = {
-        "access_key": os.environ["GALAXY_TEST_AWS_ACCESS_KEY"],
-        "secret_key": os.environ["GALAXY_TEST_AWS_SECRET_KEY"],
-        "bucket": os.environ["GALAXY_TEST_AWS_BUCKET"],
-        "region": os.environ["GALAXY_TEST_AWS_REGION"],
-    }
-    with TestConfig(AMAZON_CLOUDBRIDGE_WITH_REGION_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (
-        _,
-        object_store,
-    ):
+    with integration_test_config("cloud_integration_test_aws_with_region.yml") as (_, object_store):
+        verify_caching_object_store_functionality(tmp_path, object_store)
+
+
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_BUCKET")
+def test_gcp_via_s3_interop(tmp_path):
+    with integration_test_config("gcp_s3_integration_test.yml") as (_, object_store):
+        verify_caching_object_store_functionality(tmp_path, object_store)
+
+
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_BUCKET")
+def test_gcp_via_s3_interop_and_boto3(tmp_path):
+    with integration_test_config("gcp_boto3_integration_test.yml") as (_, object_store):
+        verify_caching_object_store_functionality(tmp_path, object_store)
+
+
+# Ensures boto3 will use the legacy connection parameters that the generic_s3 object store
+# would consume.
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY") +@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY") +@skip_unless_environ("GALAXY_TEST_GOOGLE_BUCKET") +def test_gcp_via_s3_interop_and_boto3_with_legacy_params(tmp_path): + with integration_test_config("gcp_boto3_integration_test_legacy_params.yml") as (_, object_store): verify_caching_object_store_functionality(tmp_path, object_store) @@ -1628,6 +1303,10 @@ def rel_path_for_uuid_test(self): rel_path = os.path.join(*directory_hash_id(self.uuid)) return rel_path + @property + def _extra_files_rel_path(self): + return f"dataset_{self.uuid}_files" + def _assert_has_keys(the_dict, keys): for key in keys: