diff --git a/lib/galaxy/config/sample/object_store_conf.sample.yml b/lib/galaxy/config/sample/object_store_conf.sample.yml
index b1b2cb34afec..0c96a549f22d 100644
--- a/lib/galaxy/config/sample/object_store_conf.sample.yml
+++ b/lib/galaxy/config/sample/object_store_conf.sample.yml
@@ -135,10 +135,64 @@ backends:
store_by: uuid
files_dir: /old-fs/galaxy/files
+
+# There are now four ways to access S3-related services. Two are
+# suitable just for AWS services (aws_s3 & cloud), one is
+# more suited for non-AWS S3-compatible services (generic_s3),
+# and finally boto3 gracefully handles either scenario.
+#
+# boto3 is built on the newest and most widely used Python client
+# outside of Galaxy. It has advanced transfer options and is likely
+# the client you should use for new setups. generic_s3 and aws_s3
+# have existed in Galaxy for longer and could perhaps be considered
+# more battle tested. Both boto3 and generic_s3 have been tested
+# with multiple non-AWS APIs, including MinIO and GCP. The cloud
+# implementation is based on CloudBridge and is still supported
+# and has been recently tested - the downsides are mostly that the
+# advanced multi-threaded processing options of boto3 are not available
+# and that it has not been battle tested like aws_s3.
+
+#
+# Sample AWS S3 Object Store configuration (newest boto3 client)
+#
+type: boto3
+auth:
+ access_key: ...
+ secret_key: ...
+bucket:
+ name: unique_bucket_name_all_lowercase
+connection: # not strictly needed, but more of the API works when this is set.
+ region: us-east-1
+transfer:
+ multipart_threshold: 10000000
+ download_max_concurrency: 5
+ upload_max_concurrency: 10
+ # any of these options:
+ # multipart_threshold, max_concurrency, multipart_chunksize,
+ # num_download_attempts, max_io_queue, io_chunksize, use_threads,
+ # and max_bandwidth
+ # can be set. By default they will apply to uploads and downloads
+ # but they can be prefixed with upload_ or download_ as shown above
+ # to apply to just one scenario. More information about these parameters
+ # can be found at:
+ # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig
+
+cache:
+ path: database/object_store_cache_s3
+ size: 1000
+ cache_updated_data: true
+extra_dirs:
+ - type: job_work
+ path: database/job_working_directory_s3
+
+
+
#
-# Sample AWS S3 Object Store configuration
+# Sample AWS S3 Object Store configuration (legacy boto implementation)
#
-
+# This implementation will use axel automatically for file transfers if it is on
+# Galaxy's path. Otherwise, it will use various Python-based strategies for multi-part
+# upload of large files, but all downloads will be single-threaded.
type: aws_s3
auth:
access_key: ...
@@ -147,6 +201,8 @@ bucket:
name: unique_bucket_name_all_lowercase
use_reduced_redundancy: false
max_chunk_size: 250
+connection: # not strictly needed, but more of the API works when this is set.
+ region: us-east-1
cache:
path: database/object_store_cache_s3
size: 1000
@@ -182,7 +238,32 @@ extra_dirs:
path: database/job_working_directory_irods
#
-# Sample non-AWS S3 Object Store (e.g. swift) configuration
+# Sample non-AWS S3 Object Store (e.g. swift) configuration (boto3)
+#
+
+type: boto3
+auth:
+ access_key: ...
+ secret_key: ...
+bucket:
+ name: unique_bucket_name_all_lowercase
+connection:
+ endpoint_url: https://swift.example.org:6000/
+ # region: some services may make use of region if specified.
+ # older-style host, port, secure, and conn_path options available to generic_s3 work
+ # here also - Galaxy will just infer an endpoint_url from them.
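+ # For example (values illustrative), an older generic_s3-style block such as
+ #   host: swift.example.org
+ #   port: 6000
+ #   secure: true
+ #   conn_path: /
+ # would presumably be treated as the endpoint_url shown above (https://swift.example.org:6000/).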
+cache:
+ path: database/object_store_cache_swift
+ size: 1000
+ cache_updated_data: true
+# transfer: # see the boto3 transfer options in the AWS configuration above.
+extra_dirs:
+ - type: job_work
+ path: database/job_working_directory_swift
+
+
+#
+# Sample non-AWS S3 Object Store (e.g. swift) configuration (legacy boto client)
#
type: generic_s3
diff --git a/lib/galaxy/dependencies/__init__.py b/lib/galaxy/dependencies/__init__.py
index 0bb785aa136a..44322353329a 100644
--- a/lib/galaxy/dependencies/__init__.py
+++ b/lib/galaxy/dependencies/__init__.py
@@ -234,6 +234,9 @@ def check_python_pam(self):
def check_azure_storage(self):
return "azure_blob" in self.object_stores
+ def check_boto3(self):
+ return "boto3" in self.object_stores
+
def check_kamaki(self):
return "pithos" in self.object_stores
diff --git a/lib/galaxy/dependencies/dev-requirements.txt b/lib/galaxy/dependencies/dev-requirements.txt
index 57a4854417dc..9ac31e86937c 100644
--- a/lib/galaxy/dependencies/dev-requirements.txt
+++ b/lib/galaxy/dependencies/dev-requirements.txt
@@ -10,6 +10,7 @@ babel==2.14.0 ; python_version >= "3.8" and python_version < "3.13"
backports-tarfile==1.1.1 ; python_version >= "3.8" and python_version < "3.12"
backports-zoneinfo==0.2.1 ; python_version >= "3.8" and python_version < "3.9"
black==24.4.2 ; python_version >= "3.8" and python_version < "3.13"
+boto3==1.34.69 ; python_version >= "3.8" and python_version < "3.13"
build==1.2.1 ; python_version >= "3.8" and python_version < "3.13"
cachecontrol[filecache]==0.14.0 ; python_version >= "3.8" and python_version < "3.13"
certifi==2024.2.2 ; python_version >= "3.8" and python_version < "3.13"
diff --git a/lib/galaxy/objectstore/__init__.py b/lib/galaxy/objectstore/__init__.py
index 62d49420e5e6..412228167971 100644
--- a/lib/galaxy/objectstore/__init__.py
+++ b/lib/galaxy/objectstore/__init__.py
@@ -55,7 +55,10 @@
from .caching import CacheTarget
if TYPE_CHECKING:
- from galaxy.model import DatasetInstance
+ from galaxy.model import (
+ Dataset,
+ DatasetInstance,
+ )
NO_SESSION_ERROR_MESSAGE = (
"Attempted to 'create' object store entity in configuration with no database session present."
@@ -373,16 +376,6 @@ def shutdown(self):
"""Close any connections for this ObjectStore."""
self.running = False
- def file_ready(
- self, obj, base_dir=None, dir_only=False, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False
- ):
- """
- Check if a file corresponding to a dataset is ready to be used.
-
- Return True if so, False otherwise
- """
- return True
-
@classmethod
def parse_xml(clazz, config_xml):
"""Parse an XML description of a configuration for this object store.
@@ -938,10 +931,6 @@ def _exists(self, obj, **kwargs):
"""Determine if the `obj` exists in any of the backends."""
return self._call_method("_exists", obj, False, False, **kwargs)
- def file_ready(self, obj, **kwargs):
- """Determine if the file for `obj` is ready to be used by any of the backends."""
- return self._call_method("file_ready", obj, False, False, **kwargs)
-
def _create(self, obj, **kwargs):
"""Create a backing file in a random backend."""
objectstore = random.choice(list(self.backends.values()))
@@ -1400,6 +1389,10 @@ def type_to_object_store_class(store: str, fsmon: bool = False) -> Tuple[Type[Ba
objectstore_constructor_kwds = {}
if store == "disk":
objectstore_class = DiskObjectStore
+ elif store == "boto3":
+ from .s3_boto3 import S3ObjectStore as Boto3ObjectStore
+
+ objectstore_class = Boto3ObjectStore
elif store in ["s3", "aws_s3"]:
from .s3 import S3ObjectStore
@@ -1672,18 +1665,27 @@ def persist_extra_files(
if not extra_files_path_name:
extra_files_path_name = primary_data.dataset.extra_files_path_name_from(object_store)
assert extra_files_path_name
- for root, _dirs, files in safe_walk(src_extra_files_path):
- extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path))
- extra_dir = os.path.normpath(extra_dir)
- for f in files:
- if not in_directory(f, src_extra_files_path):
- # Unclear if this can ever happen if we use safe_walk ... probably not ?
- raise MalformedContents(f"Invalid dataset path: {f}")
- object_store.update_from_file(
- primary_data.dataset,
- extra_dir=extra_dir,
- alt_name=f,
- file_name=os.path.join(root, f),
- create=True,
- preserve_symlinks=True,
- )
+ persist_extra_files_for_dataset(object_store, src_extra_files_path, primary_data.dataset, extra_files_path_name)
+
+
+def persist_extra_files_for_dataset(
+ object_store: ObjectStore,
+ src_extra_files_path: str,
+ dataset: "Dataset",
+ extra_files_path_name: str,
+):
+ for root, _dirs, files in safe_walk(src_extra_files_path):
+ extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path))
+ extra_dir = os.path.normpath(extra_dir)
+ for f in files:
+ if not in_directory(f, src_extra_files_path):
+ # Unclear if this can ever happen if we use safe_walk ... probably not ?
+ raise MalformedContents(f"Invalid dataset path: {f}")
+ object_store.update_from_file(
+ dataset,
+ extra_dir=extra_dir,
+ alt_name=f,
+ file_name=os.path.join(root, f),
+ create=True,
+ preserve_symlinks=True,
+ )
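+
+
+# Illustrative call only - "object_store", "dataset", and the paths below are
+# hypothetical placeholders, but they show the shape of this new helper, which
+# works directly from a galaxy.model.Dataset rather than a DatasetInstance:
+#
+#     persist_extra_files_for_dataset(
+#         object_store,
+#         "/staging/extra_files",
+#         dataset,
+#         "dataset_42_files",
+#     )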
diff --git a/lib/galaxy/objectstore/_caching_base.py b/lib/galaxy/objectstore/_caching_base.py
new file mode 100644
index 000000000000..b63593ec7c50
--- /dev/null
+++ b/lib/galaxy/objectstore/_caching_base.py
@@ -0,0 +1,406 @@
+import logging
+import os
+import shutil
+from datetime import datetime
+from typing import (
+ Any,
+ Dict,
+ Optional,
+)
+
+from galaxy.exceptions import (
+ ObjectInvalid,
+ ObjectNotFound,
+)
+from galaxy.objectstore import ConcreteObjectStore
+from galaxy.util import (
+ directory_hash_id,
+ unlink,
+)
+from galaxy.util.path import safe_relpath
+from ._util import fix_permissions
+from .caching import (
+ CacheTarget,
+ InProcessCacheMonitor,
+)
+
+log = logging.getLogger(__name__)
+
+
+class CachingConcreteObjectStore(ConcreteObjectStore):
+ staging_path: str
+ extra_dirs: Dict[str, str]
+ config: Any
+ cache_updated_data: bool
+ enable_cache_monitor: bool
+ cache_size: int
+ cache_monitor: Optional[InProcessCacheMonitor] = None
+ cache_monitor_interval: int
+
+ def _ensure_staging_path_writable(self):
+ staging_path = self.staging_path
+ if not os.path.exists(staging_path):
+ os.makedirs(staging_path, exist_ok=True)
+ if not os.path.exists(staging_path):
+ raise Exception(f"Caching object store created with path '{staging_path}' that does not exist")
+
+ if not os.access(staging_path, os.R_OK):
+ raise Exception(f"Caching object store created with path '{staging_path}' that is not readable")
+ if not os.access(staging_path, os.W_OK):
+ raise Exception(f"Caching object store created with path '{staging_path}' that is not writable")
+
+ def _construct_path(
+ self,
+ obj,
+ base_dir=None,
+ dir_only=None,
+ extra_dir=None,
+ extra_dir_at_root=False,
+ alt_name=None,
+ obj_dir=False,
+ in_cache=False,
+ **kwargs,
+ ):
+ # extra_dir should never be constructed from provided data but just
+ # make sure there are no shenanigans afoot
+ if extra_dir and extra_dir != os.path.normpath(extra_dir):
+ log.warning("extra_dir is not normalized: %s", extra_dir)
+ raise ObjectInvalid("The requested object is invalid")
+ # ensure that any parent directory references in alt_name would not
+ # result in a path not contained in the directory path constructed here
+ if alt_name:
+ if not safe_relpath(alt_name):
+ log.warning("alt_name would locate path outside dir: %s", alt_name)
+ raise ObjectInvalid("The requested object is invalid")
+ # alt_name can contain parent directory references, but S3 will not
+ # follow them, so if they are valid we normalize them out
+ alt_name = os.path.normpath(alt_name)
+
+ object_id = self._get_object_id(obj)
+ rel_path = os.path.join(*directory_hash_id(object_id))
+
+ if extra_dir is not None:
+ if extra_dir_at_root:
+ rel_path = os.path.join(extra_dir, rel_path)
+ else:
+ rel_path = os.path.join(rel_path, extra_dir)
+
+ # for JOB_WORK directory
+ if obj_dir:
+ rel_path = os.path.join(rel_path, str(object_id))
+ if base_dir:
+ base = self.extra_dirs.get(base_dir)
+ assert base
+ return os.path.join(base, rel_path)
+
+ # This is how the remote file stores represent folders
+ rel_path = f"{rel_path}/"
+
+ if not dir_only:
+ rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{object_id}.dat")
+
+ if in_cache:
+ return self._get_cache_path(rel_path)
+
+ return rel_path
+
+ def _get_cache_path(self, rel_path: str) -> str:
+ return os.path.abspath(os.path.join(self.staging_path, rel_path))
+
+ def _in_cache(self, rel_path: str) -> bool:
+ """Check if the given dataset is in the local cache and return True if so."""
+ cache_path = self._get_cache_path(rel_path)
+ return os.path.exists(cache_path)
+
+ def _pull_into_cache(self, rel_path) -> bool:
+ # Ensure the cache directory structure exists (e.g., dataset_#_files/)
+ rel_path_dir = os.path.dirname(rel_path)
+ if not os.path.exists(self._get_cache_path(rel_path_dir)):
+ os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True)
+ # Now pull in the file
+ file_ok = self._download(rel_path)
+ if file_ok:
+ fix_permissions(self.config, self._get_cache_path(rel_path_dir))
+ else:
+ unlink(self._get_cache_path(rel_path), ignore_errors=True)
+ return file_ok
+
+ def _get_data(self, obj, start=0, count=-1, **kwargs):
+ rel_path = self._construct_path(obj, **kwargs)
+ # Check cache first and get file if not there
+ if not self._in_cache(rel_path):
+ self._pull_into_cache(rel_path)
+ # Read the file content from cache
+ data_file = open(self._get_cache_path(rel_path))
+ data_file.seek(start)
+ content = data_file.read(count)
+ data_file.close()
+ return content
+
+ def _exists(self, obj, **kwargs):
+ in_cache = exists_remotely = False
+ rel_path = self._construct_path(obj, **kwargs)
+ dir_only = kwargs.get("dir_only", False)
+ base_dir = kwargs.get("base_dir", None)
+
+ # check job work directory stuff early to skip API hits.
+ if dir_only and base_dir:
+ if not os.path.exists(rel_path):
+ os.makedirs(rel_path, exist_ok=True)
+ return True
+
+ in_cache = self._in_cache(rel_path)
+ exists_remotely = self._exists_remotely(rel_path)
+ dir_only = kwargs.get("dir_only", False)
+ base_dir = kwargs.get("base_dir", None)
+ if dir_only:
+ if in_cache or exists_remotely:
+ return True
+ else:
+ return False
+
+ # TODO: Sync should probably not be done here. Add this to an async upload stack?
+ if in_cache and not exists_remotely:
+ self._push_to_storage(rel_path, source_file=self._get_cache_path(rel_path))
+ return True
+ elif exists_remotely:
+ return True
+ else:
+ return False
+
+ def _create(self, obj, **kwargs):
+ if not self._exists(obj, **kwargs):
+ # Pull out locally used fields
+ extra_dir = kwargs.get("extra_dir", None)
+ extra_dir_at_root = kwargs.get("extra_dir_at_root", False)
+ dir_only = kwargs.get("dir_only", False)
+ alt_name = kwargs.get("alt_name", None)
+
+ # Construct hashed path
+ rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
+
+ # Optionally append extra_dir
+ if extra_dir is not None:
+ if extra_dir_at_root:
+ rel_path = os.path.join(extra_dir, rel_path)
+ else:
+ rel_path = os.path.join(rel_path, extra_dir)
+
+ # Create given directory in cache
+ cache_dir = os.path.join(self.staging_path, rel_path)
+ if not os.path.exists(cache_dir):
+ os.makedirs(cache_dir, exist_ok=True)
+
+ # If instructed, create the dataset in cache & in S3
+ if not dir_only:
+ rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
+ open(os.path.join(self.staging_path, rel_path), "w").close()
+ self._push_to_storage(rel_path, from_string="")
+ return self
+
+ def _caching_allowed(self, rel_path: str, remote_size: Optional[int] = None) -> bool:
+ if remote_size is None:
+ remote_size = self._get_remote_size(rel_path)
+ if not self.cache_target.fits_in_cache(remote_size):
+ log.critical(
+ "File %s is larger (%s bytes) than the configured cache allows (%s). Cannot download.",
+ rel_path,
+ remote_size,
+ self.cache_target.log_description,
+ )
+ return False
+ return True
+
+ def _push_to_storage(self, rel_path, source_file=None, from_string=None):
+ source_file = source_file or self._get_cache_path(rel_path)
+ if from_string is None and not os.path.exists(source_file):
+ log.error(
+ "Tried updating remote path '%s' from source file '%s', but source file does not exist.",
+ rel_path,
+ source_file,
+ )
+ return False
+
+ if from_string is None and os.path.getsize(source_file) == 0:
+ log.debug(
+ "Wanted to push file '%s' to remote path '%s' but its size is 0; skipping.", source_file, rel_path
+ )
+ return True
+
+ if from_string is not None:
+ return self._push_string_to_path(rel_path, from_string)
+ else:
+ start_time = datetime.now()
+ log.debug(
+ "Pushing cache file '%s' of size %s bytes to '%s'",
+ source_file,
+ os.path.getsize(source_file),
+ rel_path,
+ )
+ success = self._push_file_to_path(rel_path, source_file)
+ end_time = datetime.now()
+ log.debug(
+ "Pushed cache file '%s' to blob '%s' (%s bytes transferred in %s sec)",
+ source_file,
+ rel_path,
+ os.path.getsize(source_file),
+ end_time - start_time,
+ )
+ return success
+
+ def _empty(self, obj, **kwargs):
+ if self._exists(obj, **kwargs):
+ return self._size(obj, **kwargs) == 0
+ else:
+ raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}")
+
+ def _size(self, obj, **kwargs):
+ rel_path = self._construct_path(obj, **kwargs)
+ if self._in_cache(rel_path):
+ try:
+ return os.path.getsize(self._get_cache_path(rel_path))
+ except OSError as ex:
+ log.info("Could not get size of file '%s' in local cache, will try remote. Error: %s", rel_path, ex)
+ elif self._exists_remotely(rel_path):
+ return self._get_remote_size(rel_path)
+ log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
+ return 0
+
+ def _get_filename(self, obj, **kwargs):
+ base_dir = kwargs.get("base_dir", None)
+ dir_only = kwargs.get("dir_only", False)
+ obj_dir = kwargs.get("obj_dir", False)
+ sync_cache = kwargs.get("sync_cache", True)
+
+ rel_path = self._construct_path(obj, **kwargs)
+
+ # for JOB_WORK directory
+ if base_dir and dir_only and obj_dir:
+ return os.path.abspath(rel_path)
+
+ cache_path = self._get_cache_path(rel_path)
+ if not sync_cache:
+ return cache_path
+
+ # Check if the file exists in the cache first; always pull if the file size in the cache is zero.
+ # For dir_only - the cache cleaning may have left empty directories, so I think we need to
+ # always resync the cache. We've got to make sure we're being judicious in our data.extra_files_path
+ # calls, I think.
+ if not dir_only and self._in_cache(rel_path) and os.path.getsize(self._get_cache_path(rel_path)) > 0:
+ return cache_path
+
+ # Check if the file exists in persistent storage and, if it does, pull it into cache
+ elif self._exists(obj, **kwargs):
+ if dir_only:
+ self._download_directory_into_cache(rel_path, cache_path)
+ return cache_path
+ else:
+ if self._pull_into_cache(rel_path):
+ return cache_path
+ raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}")
+
+ def _download_directory_into_cache(self, rel_path, cache_path):
+ # pithos & irods never did this prior to refactoring so I am assuming
+ # there are just operations that fail with these object stores.
+ # As part of the refactoring that resulted in this method
+ # (https://github.com/galaxyproject/galaxy/pull/18117) I wrote test
+ # cases and verified that the other object stores that didn't implement
+ # this had issues - I implemented this new functionality in the
+ # Azure and Cloud object stores to fix them. New
+ # object stores should definitely override this.
+ pass
+
+ def _delete(self, obj, entire_dir=False, **kwargs):
+ rel_path = self._construct_path(obj, **kwargs)
+ extra_dir = kwargs.get("extra_dir", None)
+ base_dir = kwargs.get("base_dir", None)
+ dir_only = kwargs.get("dir_only", False)
+ obj_dir = kwargs.get("obj_dir", False)
+ try:
+ # Remove temporary data in JOB_WORK directory
+ if base_dir and dir_only and obj_dir:
+ shutil.rmtree(os.path.abspath(rel_path))
+ return True
+
+ # For the case of extra_files, because we don't have a reference to
+ # individual files/keys we need to remove the entire directory structure
+ # with all the files in it. This is easy for the local file system,
+ # but requires iterating through each individual key in S3 and deleting it.
+ if entire_dir and extra_dir:
+ shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True)
+ return self._delete_remote_all(rel_path)
+ else:
+ # Delete from cache first
+ unlink(self._get_cache_path(rel_path), ignore_errors=True)
+ # Delete from S3 as well
+ if self._exists_remotely(rel_path):
+ return self._delete_existing_remote(rel_path)
+ except OSError:
+ log.exception("%s delete error", self._get_filename(obj, **kwargs))
+ return False
+
+ def _update_from_file(self, obj, file_name=None, create=False, **kwargs):
+ if create:
+ self._create(obj, **kwargs)
+
+ if self._exists(obj, **kwargs):
+ rel_path = self._construct_path(obj, **kwargs)
+ # Choose whether to use the dataset file itself or an alternate file
+ if file_name:
+ source_file = os.path.abspath(file_name)
+ # Copy into cache
+ cache_file = self._get_cache_path(rel_path)
+ try:
+ if source_file != cache_file and self.cache_updated_data:
+ # FIXME? Should this be a `move`?
+ shutil.copy2(source_file, cache_file)
+ fix_permissions(self.config, cache_file)
+ except OSError:
+ log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
+ else:
+ source_file = self._get_cache_path(rel_path)
+
+ self._push_to_storage(rel_path, source_file)
+
+ else:
+ raise ObjectNotFound(
+ f"objectstore.update_from_file, object does not exist: {str(obj)}, kwargs: {str(kwargs)}"
+ )
+
+ @property
+ def cache_target(self) -> CacheTarget:
+ return CacheTarget(
+ self.staging_path,
+ self.cache_size,
+ 0.9,
+ )
+
+ def _shutdown_cache_monitor(self) -> None:
+ self.cache_monitor and self.cache_monitor.shutdown()
+
+ def _start_cache_monitor_if_needed(self):
+ if self.enable_cache_monitor:
+ self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval)
+
+ def _get_remote_size(self, rel_path: str) -> int:
+ raise NotImplementedError()
+
+ def _exists_remotely(self, rel_path: str) -> bool:
+ raise NotImplementedError()
+
+ def _download(self, rel_path: str) -> bool:
+ raise NotImplementedError()
+
+ # These do not need to be overridden if _delete is replaced instead
+ def _delete_existing_remote(self, rel_path) -> bool:
+ raise NotImplementedError()
+
+ def _delete_remote_all(self, rel_path) -> bool:
+ raise NotImplementedError()
+
+ # These do not need to be overridden if _push_to_storage is replaced instead
+ def _push_string_to_path(self, rel_path: str, from_string: str) -> bool:
+ raise NotImplementedError()
+
+ def _push_file_to_path(self, rel_path: str, target_file: str) -> bool:
+ raise NotImplementedError()
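+
+
+# A minimal sketch (illustrative only, not part of this module) of what a concrete
+# caching backend supplies on top of CachingConcreteObjectStore. The directory-backed
+# "remote" (REMOTE_ROOT) and the DirectoryBackedObjectStore name are hypothetical and
+# only demonstrate the contract of the hooks above; a real backend would call its
+# storage service's client instead.
+#
+#     class DirectoryBackedObjectStore(CachingConcreteObjectStore):
+#         store_type = "directory_backed"  # hypothetical
+#         REMOTE_ROOT = "/tmp/fake_remote"
+#
+#         def _remote(self, rel_path: str) -> str:
+#             return os.path.join(self.REMOTE_ROOT, rel_path)
+#
+#         def _get_remote_size(self, rel_path: str) -> int:
+#             return os.path.getsize(self._remote(rel_path))
+#
+#         def _exists_remotely(self, rel_path: str) -> bool:
+#             return os.path.exists(self._remote(rel_path))
+#
+#         def _download(self, rel_path: str) -> bool:
+#             if not self._caching_allowed(rel_path):
+#                 return False
+#             shutil.copy2(self._remote(rel_path), self._get_cache_path(rel_path))
+#             return True
+#
+#         def _push_string_to_path(self, rel_path: str, from_string: str) -> bool:
+#             os.makedirs(os.path.dirname(self._remote(rel_path)), exist_ok=True)
+#             with open(self._remote(rel_path), "w") as f:
+#                 f.write(from_string)
+#             return True
+#
+#         def _push_file_to_path(self, rel_path: str, source_file: str) -> bool:
+#             os.makedirs(os.path.dirname(self._remote(rel_path)), exist_ok=True)
+#             shutil.copy2(source_file, self._remote(rel_path))
+#             return True
+#
+#         def _delete_existing_remote(self, rel_path: str) -> bool:
+#             os.remove(self._remote(rel_path))
+#             return True
+#
+#         def _delete_remote_all(self, rel_path: str) -> bool:
+#             shutil.rmtree(self._remote(rel_path), ignore_errors=True)
+#             return True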
diff --git a/lib/galaxy/objectstore/_util.py b/lib/galaxy/objectstore/_util.py
new file mode 100644
index 000000000000..fbdf9adde4f6
--- /dev/null
+++ b/lib/galaxy/objectstore/_util.py
@@ -0,0 +1,35 @@
+import multiprocessing
+import os
+import subprocess
+
+from galaxy.util import (
+ umask_fix_perms,
+ which,
+)
+
+
+def fix_permissions(config, rel_path: str):
+ """Set permissions on rel_path"""
+ for basedir, _, files in os.walk(rel_path):
+ umask_fix_perms(basedir, config.umask, 0o777, config.gid)
+ for filename in files:
+ path = os.path.join(basedir, filename)
+ # Ignore symlinks
+ if os.path.islink(path):
+ continue
+ umask_fix_perms(path, config.umask, 0o666, config.gid)
+
+
+class UsesAxel:
+ use_axel: bool
+
+ def _init_axel(self) -> None:
+ if which("axel"):
+ self.use_axel = True
+ else:
+ self.use_axel = False
+
+ def _axel_download(self, url: str, path: str):
+ ncores = multiprocessing.cpu_count()
+ ret_code = subprocess.call(["axel", "-a", "-o", path, "-n", str(ncores), url])
+ return ret_code == 0
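+
+
+# Rough sketch (illustrative only) of how a class is expected to wire in the
+# UsesAxel mixin: call _init_axel() once during initialization and prefer
+# _axel_download() for pulls when axel was found on the path.
+#
+#     class ExampleDownloader(UsesAxel):
+#         def __init__(self):
+#             self._init_axel()
+#
+#         def fetch(self, url: str, destination: str) -> bool:
+#             if self.use_axel:
+#                 return self._axel_download(url, destination)
+#             # otherwise fall back to a plain single-stream download
+#             raise NotImplementedError()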
diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py
index 39e3c7490eb1..d3008ac77ccc 100644
--- a/lib/galaxy/objectstore/azure_blob.py
+++ b/lib/galaxy/objectstore/azure_blob.py
@@ -4,12 +4,10 @@
import logging
import os
-import shutil
from datetime import (
datetime,
timedelta,
)
-from typing import Optional
try:
from azure.common import AzureHttpError
@@ -19,23 +17,11 @@
generate_blob_sas,
)
except ImportError:
- BlobServiceClient = None
+ BlobServiceClient = None # type: ignore[assignment,unused-ignore,misc]
-from galaxy.exceptions import (
- ObjectInvalid,
- ObjectNotFound,
-)
-from galaxy.util import (
- directory_hash_id,
- umask_fix_perms,
- unlink,
-)
-from galaxy.util.path import safe_relpath
-from . import ConcreteObjectStore
+from ._caching_base import CachingConcreteObjectStore
from .caching import (
- CacheTarget,
enable_cache_monitor,
- InProcessCacheMonitor,
parse_caching_config_dict_from_xml,
)
@@ -57,7 +43,24 @@ def parse_config_xml(config_xml):
container_xml = config_xml.find("container")
container_name = container_xml.get("name")
- max_chunk_size = int(container_xml.get("max_chunk_size", 250)) # currently unused
+
+ transfer_xml = config_xml.findall("transfer")
+ if not transfer_xml:
+ transfer_xml = {}
+ else:
+ transfer_xml = transfer_xml[0]
+ transfer_dict = {}
+ for key in [
+ "max_concurrency",
+ "download_max_concurrency",
+ "upload_max_concurrency",
+ "max_single_put_size",
+ "max_single_get_size",
+ "max_block_size",
+ ]:
+ value = transfer_xml.get(key)
+ if value is not None:
+ transfer_dict[key] = value
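+ # For example (attribute values illustrative), the corresponding element in the
+ # XML form of this configuration might look like:
+ #   <transfer max_concurrency="4" max_single_put_size="67108864" max_block_size="4194304" />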
cache_dict = parse_caching_config_dict_from_xml(config_xml)
@@ -79,11 +82,11 @@ def parse_config_xml(config_xml):
"auth": auth,
"container": {
"name": container_name,
- "max_chunk_size": max_chunk_size,
},
"cache": cache_dict,
+ "transfer": transfer_dict,
"extra_dirs": extra_dirs,
- "private": ConcreteObjectStore.parse_private_from_config_xml(config_xml),
+ "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml),
}
except Exception:
# Toss it back up after logging, we can't continue loading at this point.
@@ -91,14 +94,13 @@ def parse_config_xml(config_xml):
raise
-class AzureBlobObjectStore(ConcreteObjectStore):
+class AzureBlobObjectStore(CachingConcreteObjectStore):
"""
Object store that stores objects as blobs in an Azure Blob Container. A local
cache exists that is used as an intermediate location for files between
Galaxy and Azure.
"""
- cache_monitor: Optional[InProcessCacheMonitor] = None
store_type = "azure_blob"
def __init__(self, config, config_dict):
@@ -114,7 +116,20 @@ def __init__(self, config, config_dict):
self.account_key = auth_dict.get("account_key")
self.container_name = container_dict.get("name")
- self.max_chunk_size = container_dict.get("max_chunk_size", 250) # currently unused
+ raw_transfer_dict = config_dict.get("transfer", {})
+ typed_transfer_dict = {}
+ for key in [
+ "max_concurrency",
+ "download_max_concurrency",
+ "upload_max_concurrency",
+ "max_single_put_size",
+ "max_single_get_size",
+ "max_block_size",
+ ]:
+ value = raw_transfer_dict.get(key)
+ if value is not None:
+ typed_transfer_dict[key] = int(value)
+ self.transfer_dict = typed_transfer_dict
self.cache_size = cache_dict.get("size") or self.config.object_store_cache_size
self.staging_path = cache_dict.get("path") or self.config.object_store_cache_path
@@ -127,9 +142,8 @@ def _initialize(self):
raise Exception(NO_BLOBSERVICE_ERROR_MESSAGE)
self._configure_connection()
-
- if self.enable_cache_monitor:
- self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval)
+ self._ensure_staging_path_writable()
+ self._start_cache_monitor_if_needed()
def to_dict(self):
as_dict = super().to_dict()
@@ -144,8 +158,8 @@ def to_dict(self):
"auth": auth,
"container": {
"name": self.container_name,
- "max_chunk_size": self.max_chunk_size,
},
+ "transfer": self.transfer_dict,
"cache": {
"size": self.cache_size,
"path": self.staging_path,
@@ -155,10 +169,6 @@ def to_dict(self):
)
return as_dict
- ###################
- # Private Methods #
- ###################
-
# config_xml is an ElementTree object.
@classmethod
def parse_xml(clazz, config_xml):
@@ -166,87 +176,31 @@ def parse_xml(clazz, config_xml):
def _configure_connection(self):
log.debug("Configuring Connection")
+ extra_kwds = {}
+ for key in [
+ "max_single_put_size",
+ "max_single_get_size",
+ "max_block_size",
+ ]:
+ if key in self.transfer_dict:
+ extra_kwds[key] = self.transfer_dict[key]
+
if self.account_url:
# https://pypi.org/project/azure-storage-blob/
service = BlobServiceClient(
account_url=self.account_url,
credential={"account_name": self.account_name, "account_key": self.account_key},
+ **extra_kwds,
)
else:
service = BlobServiceClient(
account_url=f"https://{self.account_name}.blob.core.windows.net",
credential=self.account_key,
+ **extra_kwds,
)
self.service = service
- def _construct_path(
- self,
- obj,
- base_dir=None,
- dir_only=None,
- extra_dir=None,
- extra_dir_at_root=False,
- alt_name=None,
- obj_dir=False,
- in_cache=False,
- **kwargs,
- ):
- # extra_dir should never be constructed from provided data but just
- # make sure there are no shenannigans afoot
- if extra_dir and extra_dir != os.path.normpath(extra_dir):
- log.warning("extra_dir is not normalized: %s", extra_dir)
- raise ObjectInvalid("The requested object is invalid")
- # ensure that any parent directory references in alt_name would not
- # result in a path not contained in the directory path constructed here
- if alt_name:
- if not safe_relpath(alt_name):
- log.warning("alt_name would locate path outside dir: %s", alt_name)
- raise ObjectInvalid("The requested object is invalid")
- # alt_name can contain parent directory references, but S3 will not
- # follow them, so if they are valid we normalize them out
- alt_name = os.path.normpath(alt_name)
-
- rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
-
- if extra_dir is not None:
- if extra_dir_at_root:
- rel_path = os.path.join(extra_dir, rel_path)
- else:
- rel_path = os.path.join(rel_path, extra_dir)
-
- # for JOB_WORK directory
- if obj_dir:
- rel_path = os.path.join(rel_path, str(self._get_object_id(obj)))
- if base_dir:
- base = self.extra_dirs.get(base_dir)
- return os.path.join(base, rel_path)
-
- # S3 folders are marked by having trailing '/' so add it now
- # rel_path = '%s/' % rel_path # assume for now we don't need this in Azure blob storage.
-
- if not dir_only:
- rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
-
- if in_cache:
- return self._get_cache_path(rel_path)
-
- return rel_path
-
- def _fix_permissions(self, rel_path):
- """Set permissions on rel_path"""
- for basedir, _, files in os.walk(rel_path):
- umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
- for filename in files:
- path = os.path.join(basedir, filename)
- # Ignore symlinks
- if os.path.islink(path):
- continue
- umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)
-
- def _get_cache_path(self, rel_path):
- return os.path.abspath(os.path.join(self.staging_path, rel_path))
-
- def _get_size_in_azure(self, rel_path):
+ def _get_remote_size(self, rel_path):
try:
properties = self._blob_client(rel_path).get_blob_properties()
size_in_bytes = properties.size
@@ -255,9 +209,20 @@ def _get_size_in_azure(self, rel_path):
log.exception("Could not get size of blob '%s' from Azure", rel_path)
return -1
- def _in_azure(self, rel_path):
+ def _blobs_from(self, rel_path):
+ return self.service.get_container_client(self.container_name).list_blobs(name_starts_with=rel_path)
+
+ def _exists_remotely(self, rel_path: str):
try:
- exists = self._blob_client(rel_path).exists()
+ is_dir = rel_path[-1] == "/"
+ if is_dir:
+ blobs = self._blobs_from(rel_path)
+ if blobs:
+ return True
+ else:
+ return False
+ else:
+ exists = self._blob_client(rel_path).exists()
except AzureHttpError:
log.exception("Trouble checking existence of Azure blob '%s'", rel_path)
return False
@@ -266,308 +231,82 @@ def _in_azure(self, rel_path):
def _blob_client(self, rel_path: str):
return self.service.get_blob_client(self.container_name, rel_path)
- def _in_cache(self, rel_path):
- """Check if the given dataset is in the local cache."""
- cache_path = self._get_cache_path(rel_path)
- return os.path.exists(cache_path)
-
- def _pull_into_cache(self, rel_path):
- # Ensure the cache directory structure exists (e.g., dataset_#_files/)
- rel_path_dir = os.path.dirname(rel_path)
- if not os.path.exists(self._get_cache_path(rel_path_dir)):
- os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True)
- # Now pull in the file
- file_ok = self._download(rel_path)
- self._fix_permissions(self._get_cache_path(rel_path_dir))
- return file_ok
-
def _download(self, rel_path):
local_destination = self._get_cache_path(rel_path)
try:
log.debug("Pulling '%s' into cache to %s", rel_path, local_destination)
- if not self.cache_target.fits_in_cache(self._get_size_in_azure(rel_path)):
- log.critical(
- "File %s is larger (%s bytes) than the configured cache allows (%s). Cannot download.",
- rel_path,
- self._get_size_in_azure(rel_path),
- self.cache_target.log_description,
- )
+ if not self._caching_allowed(rel_path):
return False
else:
- with open(local_destination, "wb") as f:
- self._blob_client(rel_path).download_blob().download_to_stream(f)
+ self._download_to_file(rel_path, local_destination)
return True
except AzureHttpError:
log.exception("Problem downloading '%s' from Azure", rel_path)
return False
- def _push_to_os(self, rel_path, source_file=None, from_string=None):
- """
- Push the file pointed to by ``rel_path`` to the object store naming the blob
- ``rel_path``. If ``source_file`` is provided, push that file instead while
- still using ``rel_path`` as the blob name.
- If ``from_string`` is provided, set contents of the file to the value of
- the string.
- """
- try:
- source_file = source_file or self._get_cache_path(rel_path)
+ def _download_to_file(self, rel_path, local_destination):
+ kwd = {}
+ max_concurrency = self.transfer_dict.get("download_max_concurrency") or self.transfer_dict.get(
+ "max_concurrency"
+ )
+ if max_concurrency is not None:
+ kwd["max_concurrency"] = max_concurrency
+ with open(local_destination, "wb") as f:
+ self._blob_client(rel_path).download_blob().download_to_stream(f, **kwd)
- if from_string is None and not os.path.exists(source_file):
- log.error(
- "Tried updating blob '%s' from source file '%s', but source file does not exist.",
- rel_path,
- source_file,
- )
- return False
+ def _download_directory_into_cache(self, rel_path, cache_path):
+ blobs = self._blobs_from(rel_path)
+ for blob in blobs:
+ key = blob.name
+ local_file_path = os.path.join(cache_path, os.path.relpath(key, rel_path))
- if from_string is None and os.path.getsize(source_file) == 0:
- log.debug(
- "Wanted to push file '%s' to azure blob '%s' but its size is 0; skipping.", source_file, rel_path
- )
- return True
+ # Create directories if they don't exist
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
- if from_string is not None:
- self._blob_client(rel_path).upload_blob(from_string, overwrite=True)
- log.debug("Pushed data from string '%s' to blob '%s'", from_string, rel_path)
- else:
- start_time = datetime.now()
- log.debug(
- "Pushing cache file '%s' of size %s bytes to '%s'",
- source_file,
- os.path.getsize(source_file),
- rel_path,
- )
- with open(source_file, "rb") as f:
- self._blob_client(rel_path).upload_blob(f, overwrite=True)
- end_time = datetime.now()
- log.debug(
- "Pushed cache file '%s' to blob '%s' (%s bytes transferred in %s sec)",
- source_file,
- rel_path,
- os.path.getsize(source_file),
- end_time - start_time,
- )
- return True
+ # Download the file
+ self._download_to_file(key, local_file_path)
+ def _push_string_to_path(self, rel_path: str, from_string: str) -> bool:
+ try:
+ self._blob_client(rel_path).upload_blob(from_string, overwrite=True)
+ return True
except AzureHttpError:
- log.exception("Trouble pushing to Azure Blob '%s' from file '%s'", rel_path, source_file)
- return False
-
- ##################
- # Public Methods #
- ##################
-
- def _exists(self, obj, **kwargs):
- in_cache = in_azure = False
- rel_path = self._construct_path(obj, **kwargs)
- dir_only = kwargs.get("dir_only", False)
- base_dir = kwargs.get("base_dir", None)
+ log.exception("Trouble pushing to Azure Blob '%s' from string", rel_path)
+ return False
- # check job work directory stuff early to skip API hits.
- if dir_only and base_dir:
- if not os.path.exists(rel_path):
- os.makedirs(rel_path, exist_ok=True)
+ def _push_file_to_path(self, rel_path: str, source_file: str) -> bool:
+ try:
+ with open(source_file, "rb") as f:
+ kwd = {}
+ max_concurrency = self.transfer_dict.get("upload_max_concurrency") or self.transfer_dict.get(
+ "max_concurrency"
+ )
+ if max_concurrency is not None:
+ kwd["max_concurrency"] = max_concurrency
+ self._blob_client(rel_path).upload_blob(f, overwrite=True, **kwd)
return True
+ except AzureHttpError:
+ log.exception("Trouble pushing to Azure Blob '%s' from file '%s'", rel_path, source_file)
+ return False
- in_cache = self._in_cache(rel_path)
- in_azure = self._in_azure(rel_path)
- # log.debug("~~~~~~ File '%s' exists in cache: %s; in azure: %s" % (rel_path, in_cache, in_azure))
- # dir_only does not get synced so shortcut the decision
- dir_only = kwargs.get("dir_only", False)
- base_dir = kwargs.get("base_dir", None)
- if dir_only:
- if in_cache or in_azure:
- return True
- else:
- return False
-
- # TODO: Sync should probably not be done here. Add this to an async upload stack?
- if in_cache and not in_azure:
- self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
- return True
- elif in_azure:
+ def _delete_remote_all(self, rel_path: str) -> bool:
+ try:
+ blobs = self._blobs_from(rel_path)
+ for blob in blobs:
+ log.debug("Deleting from Azure: %s", blob)
+ self._blob_client(blob.name).delete_blob()
return True
- else:
+ except AzureHttpError:
+ log.exception("Could not delete blob '%s' from Azure", rel_path)
return False
- def file_ready(self, obj, **kwargs):
- """
- A helper method that checks if a file corresponding to a dataset is
- ready and available to be used. Return ``True`` if so, ``False`` otherwise.
- """
- rel_path = self._construct_path(obj, **kwargs)
- # Make sure the size in cache is available in its entirety
- if self._in_cache(rel_path):
- local_size = os.path.getsize(self._get_cache_path(rel_path))
- remote_size = self._get_size_in_azure(rel_path)
- if local_size == remote_size:
- return True
- else:
- log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path, local_size, remote_size)
-
- return False
-
- def _create(self, obj, **kwargs):
- if not self._exists(obj, **kwargs):
- # Pull out locally used fields
- extra_dir = kwargs.get("extra_dir", None)
- extra_dir_at_root = kwargs.get("extra_dir_at_root", False)
- dir_only = kwargs.get("dir_only", False)
- alt_name = kwargs.get("alt_name", None)
-
- # Construct hashed path
- rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
-
- # Optionally append extra_dir
- if extra_dir is not None:
- if extra_dir_at_root:
- rel_path = os.path.join(extra_dir, rel_path)
- else:
- rel_path = os.path.join(rel_path, extra_dir)
-
- # Create given directory in cache
- cache_dir = os.path.join(self.staging_path, rel_path)
- if not os.path.exists(cache_dir):
- os.makedirs(cache_dir, exist_ok=True)
-
- # Although not really necessary to create S3 folders (because S3 has
- # flat namespace), do so for consistency with the regular file system
- # S3 folders are marked by having trailing '/' so add it now
- # s3_dir = '%s/' % rel_path
- # self._push_to_os(s3_dir, from_string='')
- # If instructed, create the dataset in cache & in S3
- if not dir_only:
- rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
- open(os.path.join(self.staging_path, rel_path), "w").close()
- self._push_to_os(rel_path, from_string="")
- return self
-
- def _empty(self, obj, **kwargs):
- if self._exists(obj, **kwargs):
- size = self._size(obj, **kwargs)
- is_empty = bool(size == 0)
- return is_empty
- else:
- raise ObjectNotFound(f"objectstore.empty, object does not exist: {str(obj)}, kwargs: {str(kwargs)}")
-
- def _size(self, obj, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- if self._in_cache(rel_path):
- try:
- return os.path.getsize(self._get_cache_path(rel_path))
- except OSError as ex:
- log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s", rel_path, ex)
- elif self._exists(obj, **kwargs):
- return self._get_size_in_azure(rel_path)
- log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
- return 0
-
- def _delete(self, obj, entire_dir=False, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- extra_dir = kwargs.get("extra_dir", None)
- base_dir = kwargs.get("base_dir", None)
- dir_only = kwargs.get("dir_only", False)
- obj_dir = kwargs.get("obj_dir", False)
+ def _delete_existing_remote(self, rel_path: str) -> bool:
try:
- if base_dir and dir_only and obj_dir:
- # Remove temporary data in JOB_WORK directory
- shutil.rmtree(os.path.abspath(rel_path))
- return True
-
- # For the case of extra_files, because we don't have a reference to
- # individual files/blobs we need to remove the entire directory structure
- # with all the files in it. This is easy for the local file system,
- # but requires iterating through each individual blob in Azure and deleing it.
- if entire_dir and extra_dir:
- shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True)
- blobs = self.service.get_container_client(self.container_name).list_blobs(name_starts_with=rel_path)
- for blob in blobs:
- log.debug("Deleting from Azure: %s", blob)
- self._blob_client(blob.name).delete_blob()
- return True
- else:
- # Delete from cache first
- unlink(self._get_cache_path(rel_path), ignore_errors=True)
- # Delete from S3 as well
- if self._in_azure(rel_path):
- log.debug("Deleting from Azure: %s", rel_path)
- self._blob_client(rel_path).delete_blob()
- return True
+ self._blob_client(rel_path).delete_blob()
+ return True
except AzureHttpError:
log.exception("Could not delete blob '%s' from Azure", rel_path)
- except OSError:
- log.exception("%s delete error", self._get_filename(obj, **kwargs))
- return False
-
- def _get_data(self, obj, start=0, count=-1, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- # Check cache first and get file if not there
- if not self._in_cache(rel_path):
- self._pull_into_cache(rel_path)
- # Read the file content from cache
- data_file = open(self._get_cache_path(rel_path))
- data_file.seek(start)
- content = data_file.read(count)
- data_file.close()
- return content
-
- def _get_filename(self, obj, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- base_dir = kwargs.get("base_dir", None)
- dir_only = kwargs.get("dir_only", False)
- obj_dir = kwargs.get("obj_dir", False)
- sync_cache = kwargs.get("sync_cache", True)
-
- # for JOB_WORK directory
- if base_dir and dir_only and obj_dir:
- return os.path.abspath(rel_path)
-
- cache_path = self._get_cache_path(rel_path)
- if not sync_cache:
- return cache_path
- # Check if the file exists in the cache first, always pull if file size in cache is zero
- if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0):
- return cache_path
- # Check if the file exists in persistent storage and, if it does, pull it into cache
- elif self._exists(obj, **kwargs):
- if dir_only: # Directories do not get pulled into cache
- return cache_path
- else:
- if self._pull_into_cache(rel_path):
- return cache_path
- # For the case of retrieving a directory only, return the expected path
- # even if it does not exist.
- # if dir_only:
- # return cache_path
- raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {str(obj)}, kwargs: {str(kwargs)}")
-
- def _update_from_file(self, obj, file_name=None, create=False, **kwargs):
- if create is True:
- self._create(obj, **kwargs)
-
- if self._exists(obj, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- # Chose whether to use the dataset file itself or an alternate file
- if file_name:
- source_file = os.path.abspath(file_name)
- # Copy into cache
- cache_file = self._get_cache_path(rel_path)
- try:
- if source_file != cache_file and self.cache_updated_data:
- # FIXME? Should this be a `move`?
- shutil.copy2(source_file, cache_file)
- self._fix_permissions(cache_file)
- except OSError:
- log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
- else:
- source_file = self._get_cache_path(rel_path)
-
- self._push_to_os(rel_path, source_file)
-
- else:
- raise ObjectNotFound(
- f"objectstore.update_from_file, object does not exist: {str(obj)}, kwargs: {str(kwargs)}"
- )
+ return False
def _get_object_url(self, obj, **kwargs):
if self._exists(obj, **kwargs):
@@ -593,13 +332,5 @@ def _get_store_usage_percent(self, obj):
# https://learn.microsoft.com/en-us/azure/storage/blobs/scalability-targets
return 0.0
- @property
- def cache_target(self) -> CacheTarget:
- return CacheTarget(
- self.staging_path,
- self.cache_size,
- 0.9,
- )
-
def shutdown(self):
- self.cache_monitor and self.cache_monitor.shutdown()
+ self._shutdown_cache_monitor()
diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py
index 0a7f80e37b31..79af7a6df8ad 100644
--- a/lib/galaxy/objectstore/cloud.py
+++ b/lib/galaxy/objectstore/cloud.py
@@ -3,30 +3,12 @@
"""
import logging
-import multiprocessing
import os
import os.path
-import shutil
-import subprocess
-from datetime import datetime
-from typing import Optional
-
-from galaxy.exceptions import (
- ObjectInvalid,
- ObjectNotFound,
-)
-from galaxy.util import (
- directory_hash_id,
- safe_relpath,
- umask_fix_perms,
- unlink,
-)
-from . import ConcreteObjectStore
-from .caching import (
- CacheTarget,
- enable_cache_monitor,
- InProcessCacheMonitor,
-)
+
+from ._caching_base import CachingConcreteObjectStore
+from ._util import UsesAxel
+from .caching import enable_cache_monitor
from .s3 import parse_config_xml
try:
@@ -47,36 +29,17 @@
)
-class CloudConfigMixin:
- def _config_to_dict(self):
- return {
- "provider": self.provider,
- "auth": self.credentials,
- "bucket": {
- "name": self.bucket_name,
- "use_reduced_redundancy": self.use_rr,
- },
- "cache": {
- "size": self.cache_size,
- "path": self.staging_path,
- "cache_updated_data": self.cache_updated_data,
- },
- }
-
-
-class Cloud(ConcreteObjectStore, CloudConfigMixin):
+class Cloud(CachingConcreteObjectStore, UsesAxel):
"""
Object store that stores objects as items in an cloud storage. A local
cache exists that is used as an intermediate location for files between
Galaxy and the cloud storage.
"""
- cache_monitor: Optional[InProcessCacheMonitor] = None
store_type = "cloud"
def __init__(self, config, config_dict):
super().__init__(config, config_dict)
- self.transfer_progress = 0
bucket_dict = config_dict["bucket"]
cache_dict = config_dict.get("cache") or {}
@@ -100,17 +63,9 @@ def _initialize(self):
self.conn = self._get_connection(self.provider, self.credentials)
self.bucket = self._get_bucket(self.bucket_name)
- self.start_cache_monitor()
- # Test if 'axel' is available for parallel download and pull the key into cache
- try:
- subprocess.call("axel")
- self.use_axel = True
- except OSError:
- self.use_axel = False
-
- def start_cache_monitor(self):
- if self.enable_cache_monitor:
- self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval)
+ self._ensure_staging_path_writable()
+ self._start_cache_monitor_if_needed()
+ self._init_axel()
@staticmethod
def _get_connection(provider, credentials):
@@ -235,13 +190,20 @@ def to_dict(self):
as_dict.update(self._config_to_dict())
return as_dict
- @property
- def cache_target(self) -> CacheTarget:
- return CacheTarget(
- self.staging_path,
- self.cache_size,
- 0.9,
- )
+ def _config_to_dict(self):
+ return {
+ "provider": self.provider,
+ "auth": self.credentials,
+ "bucket": {
+ "name": self.bucket_name,
+ "use_reduced_redundancy": self.use_rr,
+ },
+ "cache": {
+ "size": self.cache_size,
+ "path": self.staging_path,
+ "cache_updated_data": self.cache_updated_data,
+ },
+ }
def _get_bucket(self, bucket_name):
try:
@@ -260,75 +222,7 @@ def _get_bucket(self, bucket_name):
log.exception(f"Could not get bucket '{bucket_name}'")
raise Exception
- def _fix_permissions(self, rel_path):
- """Set permissions on rel_path"""
- for basedir, _, files in os.walk(rel_path):
- umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
- for filename in files:
- path = os.path.join(basedir, filename)
- # Ignore symlinks
- if os.path.islink(path):
- continue
- umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)
-
- def _construct_path(
- self,
- obj,
- base_dir=None,
- dir_only=None,
- extra_dir=None,
- extra_dir_at_root=False,
- alt_name=None,
- obj_dir=False,
- in_cache=False,
- **kwargs,
- ):
- # extra_dir should never be constructed from provided data but just
- # make sure there are no shenannigans afoot
- if extra_dir and extra_dir != os.path.normpath(extra_dir):
- log.warning("extra_dir is not normalized: %s", extra_dir)
- raise ObjectInvalid("The requested object is invalid")
- # ensure that any parent directory references in alt_name would not
- # result in a path not contained in the directory path constructed here
- if alt_name:
- if not safe_relpath(alt_name):
- log.warning("alt_name would locate path outside dir: %s", alt_name)
- raise ObjectInvalid("The requested object is invalid")
- # alt_name can contain parent directory references, but S3 will not
- # follow them, so if they are valid we normalize them out
- alt_name = os.path.normpath(alt_name)
- rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
- if extra_dir is not None:
- if extra_dir_at_root:
- rel_path = os.path.join(extra_dir, rel_path)
- else:
- rel_path = os.path.join(rel_path, extra_dir)
-
- # for JOB_WORK directory
- if obj_dir:
- rel_path = os.path.join(rel_path, str(self._get_object_id(obj)))
- if base_dir:
- base = self.extra_dirs.get(base_dir)
- return os.path.join(base, rel_path)
-
- # S3 folders are marked by having trailing '/' so add it now
- rel_path = f"{rel_path}/"
-
- if not dir_only:
- rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
-
- if in_cache:
- return self._get_cache_path(rel_path)
-
- return rel_path
-
- def _get_cache_path(self, rel_path):
- return os.path.abspath(os.path.join(self.staging_path, rel_path))
-
- def _get_transfer_progress(self):
- return self.transfer_progress
-
- def _get_size_in_cloud(self, rel_path):
+ def _get_remote_size(self, rel_path):
try:
obj = self.bucket.objects.get(rel_path)
return obj.size
@@ -336,7 +230,7 @@ def _get_size_in_cloud(self, rel_path):
log.exception("Could not get size of key '%s' from S3", rel_path)
return -1
- def _key_exists(self, rel_path):
+ def _exists_remotely(self, rel_path):
exists = False
try:
# A hackish way of testing if the rel_path is a folder vs a file
@@ -354,322 +248,87 @@ def _key_exists(self, rel_path):
return False
return exists
- def _in_cache(self, rel_path):
- """Check if the given dataset is in the local cache and return True if so."""
- # log.debug("------ Checking cache for rel_path %s" % rel_path)
- cache_path = self._get_cache_path(rel_path)
- return os.path.exists(cache_path)
-
- def _pull_into_cache(self, rel_path):
- # Ensure the cache directory structure exists (e.g., dataset_#_files/)
- rel_path_dir = os.path.dirname(rel_path)
- if not os.path.exists(self._get_cache_path(rel_path_dir)):
- os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True)
- # Now pull in the file
- file_ok = self._download(rel_path)
- self._fix_permissions(self._get_cache_path(rel_path_dir))
- return file_ok
-
- def _transfer_cb(self, complete, total):
- self.transfer_progress += 10
-
def _download(self, rel_path):
+ local_destination = self._get_cache_path(rel_path)
try:
- log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
+ log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination)
key = self.bucket.objects.get(rel_path)
- # Test if cache is large enough to hold the new file
- if not self.cache_target.fits_in_cache(key.size):
- log.critical(
- "File %s is larger (%s) than the configured cache allows (%s). Cannot download.",
- rel_path,
- key.size,
- self.cache_target.log_description,
- )
+ remote_size = key.size
+ if not self._caching_allowed(rel_path, remote_size):
return False
- if self.use_axel:
- log.debug("Parallel pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
- ncores = multiprocessing.cpu_count()
- url = key.generate_url(7200)
- ret_code = subprocess.call(f"axel -a -n {ncores} '{url}'")
- if ret_code == 0:
- return True
- else:
- log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
- self.transfer_progress = 0 # Reset transfer progress counter
- with open(self._get_cache_path(rel_path), "wb+") as downloaded_file_handle:
- key.save_content(downloaded_file_handle)
- return True
+ log.debug("Pulled key '%s' into cache to %s", rel_path, local_destination)
+ self._download_to(key, local_destination)
+ return True
except Exception:
log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self.bucket.name)
return False
- def _push_to_os(self, rel_path, source_file=None, from_string=None):
- """
- Push the file pointed to by ``rel_path`` to the object store naming the key
- ``rel_path``. If ``source_file`` is provided, push that file instead while
- still using ``rel_path`` as the key name.
- If ``from_string`` is provided, set contents of the file to the value of
- the string.
- """
+ def _download_directory_into_cache(self, rel_path, cache_path):
+ # List objects in the specified cloud folder
+ objects = self.bucket.objects.list(prefix=rel_path)
+
+ for obj in objects:
+ remote_file_path = obj.name
+ local_file_path = os.path.join(cache_path, os.path.relpath(remote_file_path, rel_path))
+
+ # Create directories if they don't exist
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+ # Download the file
+ self._download_to(obj, local_file_path)
+
+ def _download_to(self, key, local_destination):
+ if self.use_axel:
+ url = key.generate_url(7200)
+ return self._axel_download(url, local_destination)
+ else:
+ with open(local_destination, "wb+") as downloaded_file_handle:
+ key.save_content(downloaded_file_handle)
+
+ def _push_string_to_path(self, rel_path: str, from_string: str) -> bool:
try:
- source_file = source_file if source_file else self._get_cache_path(rel_path)
- if os.path.exists(source_file):
- if os.path.getsize(source_file) == 0 and (self.bucket.objects.get(rel_path) is not None):
- log.debug(
- "Wanted to push file '%s' to S3 key '%s' but its size is 0; skipping.", source_file, rel_path
- )
- return True
- if from_string:
- if not self.bucket.objects.get(rel_path):
- created_obj = self.bucket.objects.create(rel_path)
- created_obj.upload(source_file)
- else:
- self.bucket.objects.get(rel_path).upload(source_file)
- log.debug("Pushed data from string '%s' to key '%s'", from_string, rel_path)
- else:
- start_time = datetime.now()
- log.debug(
- "Pushing cache file '%s' of size %s bytes to key '%s'",
- source_file,
- os.path.getsize(source_file),
- rel_path,
- )
- self.transfer_progress = 0 # Reset transfer progress counter
- if not self.bucket.objects.get(rel_path):
- created_obj = self.bucket.objects.create(rel_path)
- created_obj.upload_from_file(source_file)
- else:
- self.bucket.objects.get(rel_path).upload_from_file(source_file)
-
- end_time = datetime.now()
- log.debug(
- "Pushed cache file '%s' to key '%s' (%s bytes transfered in %s sec)",
- source_file,
- rel_path,
- os.path.getsize(source_file),
- end_time - start_time,
- )
- return True
+ if not self.bucket.objects.get(rel_path):
+ created_obj = self.bucket.objects.create(rel_path)
+ created_obj.upload(from_string)
else:
- log.error(
- "Tried updating key '%s' from source file '%s', but source file does not exist.",
- rel_path,
- source_file,
- )
+ self.bucket.objects.get(rel_path).upload(from_string)
+ return True
except Exception:
- log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file)
- return False
-
- def file_ready(self, obj, **kwargs):
- """
- A helper method that checks if a file corresponding to a dataset is
- ready and available to be used. Return ``True`` if so, ``False`` otherwise.
- """
- rel_path = self._construct_path(obj, **kwargs)
- # Make sure the size in cache is available in its entirety
- if self._in_cache(rel_path):
- if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_cloud(rel_path):
- return True
- log.debug(
- "Waiting for dataset %s to transfer from OS: %s/%s",
- rel_path,
- os.path.getsize(self._get_cache_path(rel_path)),
- self._get_size_in_cloud(rel_path),
- )
- return False
+ log.exception("Trouble pushing to cloud '%s' from string", rel_path)
+ return False
- def _exists(self, obj, **kwargs):
- in_cache = False
- rel_path = self._construct_path(obj, **kwargs)
-
- # Check cache
- if self._in_cache(rel_path):
- in_cache = True
- # Check cloud
- in_cloud = self._key_exists(rel_path)
- # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3))
- # dir_only does not get synced so shortcut the decision
- dir_only = kwargs.get("dir_only", False)
- base_dir = kwargs.get("base_dir", None)
- if dir_only:
- if in_cache or in_cloud:
- return True
- # for JOB_WORK directory
- elif base_dir:
- if not os.path.exists(rel_path):
- os.makedirs(rel_path, exist_ok=True)
- return True
+ def _push_file_to_path(self, rel_path: str, source_file: str) -> bool:
+ try:
+ if not self.bucket.objects.get(rel_path):
+ created_obj = self.bucket.objects.create(rel_path)
+ created_obj.upload_from_file(source_file)
else:
- return False
-
- # TODO: Sync should probably not be done here. Add this to an async upload stack?
- if in_cache and not in_cloud:
- self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
- return True
- elif in_cloud:
+ self.bucket.objects.get(rel_path).upload_from_file(source_file)
return True
- else:
+ except Exception:
+ log.exception("Trouble pushing to cloud '%s' from file '%s'", rel_path, source_file)
return False
- def _create(self, obj, **kwargs):
- if not self._exists(obj, **kwargs):
- # Pull out locally used fields
- extra_dir = kwargs.get("extra_dir", None)
- extra_dir_at_root = kwargs.get("extra_dir_at_root", False)
- dir_only = kwargs.get("dir_only", False)
- alt_name = kwargs.get("alt_name", None)
-
- # Construct hashed path
- rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
-
- # Optionally append extra_dir
- if extra_dir is not None:
- if extra_dir_at_root:
- rel_path = os.path.join(extra_dir, rel_path)
- else:
- rel_path = os.path.join(rel_path, extra_dir)
-
- # Create given directory in cache
- cache_dir = os.path.join(self.staging_path, rel_path)
- if not os.path.exists(cache_dir):
- os.makedirs(cache_dir, exist_ok=True)
-
- if not dir_only:
- rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
- open(os.path.join(self.staging_path, rel_path), "w").close()
- self._push_to_os(rel_path, from_string="")
- return self
-
- def _empty(self, obj, **kwargs):
- if self._exists(obj, **kwargs):
- return bool(self._size(obj, **kwargs) == 0)
- else:
- raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}")
-
- def _size(self, obj, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- if self._in_cache(rel_path):
- try:
- return os.path.getsize(self._get_cache_path(rel_path))
- except OSError as ex:
- log.info("Could not get size of file '%s' in local cache, will try cloud. Error: %s", rel_path, ex)
- elif self._exists(obj, **kwargs):
- return self._get_size_in_cloud(rel_path)
- log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
- return 0
-
- def _delete(self, obj, entire_dir=False, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- extra_dir = kwargs.get("extra_dir", None)
- base_dir = kwargs.get("base_dir", None)
- dir_only = kwargs.get("dir_only", False)
- obj_dir = kwargs.get("obj_dir", False)
+ def _delete_remote_all(self, rel_path: str) -> bool:
try:
- # Remove temparory data in JOB_WORK directory
- if base_dir and dir_only and obj_dir:
- shutil.rmtree(os.path.abspath(rel_path))
- return True
-
- # For the case of extra_files, because we don't have a reference to
- # individual files/keys we need to remove the entire directory structure
- # with all the files in it. This is easy for the local file system,
- # but requires iterating through each individual key in S3 and deleing it.
- if entire_dir and extra_dir:
- shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True)
- results = self.bucket.objects.list(prefix=rel_path)
- for key in results:
- log.debug("Deleting key %s", key.name)
- key.delete()
- return True
- else:
- # Delete from cache first
- unlink(self._get_cache_path(rel_path), ignore_errors=True)
- # Delete from S3 as well
- if self._key_exists(rel_path):
- key = self.bucket.objects.get(rel_path)
- log.debug("Deleting key %s", key.name)
- key.delete()
- return True
+ results = self.bucket.objects.list(prefix=rel_path)
+ for key in results:
+ log.debug("Deleting key %s", key.name)
+ key.delete()
+ return True
except Exception:
log.exception("Could not delete key '%s' from cloud", rel_path)
- except OSError:
- log.exception("%s delete error", self._get_filename(obj, **kwargs))
- return False
+ return False
- def _get_data(self, obj, start=0, count=-1, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- # Check cache first and get file if not there
- if not self._in_cache(rel_path):
- self._pull_into_cache(rel_path)
- # Read the file content from cache
- data_file = open(self._get_cache_path(rel_path))
- data_file.seek(start)
- content = data_file.read(count)
- data_file.close()
- return content
-
- def _get_filename(self, obj, **kwargs):
- base_dir = kwargs.get("base_dir", None)
- dir_only = kwargs.get("dir_only", False)
- obj_dir = kwargs.get("obj_dir", False)
- rel_path = self._construct_path(obj, **kwargs)
- sync_cache = kwargs.get("sync_cache", True)
-
- # for JOB_WORK directory
- if base_dir and dir_only and obj_dir:
- return os.path.abspath(rel_path)
-
- cache_path = self._get_cache_path(rel_path)
- if not sync_cache:
- return cache_path
- # S3 does not recognize directories as files so cannot check if those exist.
- # So, if checking dir only, ensure given dir exists in cache and return
- # the expected cache path.
- # dir_only = kwargs.get('dir_only', False)
- # if dir_only:
- # if not os.path.exists(cache_path):
- # os.makedirs(cache_path)
- # return cache_path
- # Check if the file exists in the cache first, always pull if file size in cache is zero
- if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0):
- return cache_path
- # Check if the file exists in persistent storage and, if it does, pull it into cache
- elif self._exists(obj, **kwargs):
- if dir_only: # Directories do not get pulled into cache
- return cache_path
- else:
- if self._pull_into_cache(rel_path):
- return cache_path
- # For the case of retrieving a directory only, return the expected path
- # even if it does not exist.
- # if dir_only:
- # return cache_path
- raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}")
- # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path
-
- def _update_from_file(self, obj, file_name=None, create=False, **kwargs):
- if create:
- self._create(obj, **kwargs)
- if self._exists(obj, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- # Chose whether to use the dataset file itself or an alternate file
- if file_name:
- source_file = os.path.abspath(file_name)
- # Copy into cache
- cache_file = self._get_cache_path(rel_path)
- try:
- if source_file != cache_file and self.cache_updated_data:
- # FIXME? Should this be a `move`?
- shutil.copy2(source_file, cache_file)
- self._fix_permissions(cache_file)
- except OSError:
- log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
- else:
- source_file = self._get_cache_path(rel_path)
- # Update the file on cloud
- self._push_to_os(rel_path, source_file)
- else:
- raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}")
+ def _delete_existing_remote(self, rel_path: str) -> bool:
+ try:
+ key = self.bucket.objects.get(rel_path)
+ log.debug("Deleting key %s", key.name)
+ key.delete()
+ return True
+ except Exception:
+ log.exception("Could not delete key '%s' from cloud", rel_path)
+ return False
def _get_object_url(self, obj, **kwargs):
if self._exists(obj, **kwargs):
@@ -685,4 +344,4 @@ def _get_store_usage_percent(self, obj):
return 0.0
def shutdown(self):
- self.cache_monitor and self.cache_monitor.shutdown()
+ self._shutdown_cache_monitor()
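
Note on the hunk above: the old monolithic push/delete logic is split into narrow per-backend hooks (_push_string_to_path, _push_file_to_path, _delete_remote_all, _delete_existing_remote, _download_to) that a shared caching base class can drive. Below is a minimal sketch of that dispatch pattern only; the class name and push_to_storage are hypothetical, and this is not Galaxy's actual CachingConcreteObjectStore.

import os
from abc import ABC, abstractmethod


class CachingStoreSketch(ABC):
    """Illustrative base that drives the narrow per-backend hooks shown in the hunk above."""

    @abstractmethod
    def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: ...

    @abstractmethod
    def _push_file_to_path(self, rel_path: str, source_file: str) -> bool: ...

    def _get_cache_path(self, rel_path: str) -> str:
        # Placeholder; a real store resolves this against its staging/cache path.
        return os.path.join("database/object_store_cache", rel_path)

    def push_to_storage(self, rel_path, source_file=None, from_string=None):
        # Roughly the contract of the removed _push_to_os: push either a literal
        # string or a (cached) file to the backend under the key rel_path.
        if from_string is not None:
            return self._push_string_to_path(rel_path, from_string)
        source_file = source_file or self._get_cache_path(rel_path)
        if not os.path.exists(source_file):
            return False
        return self._push_file_to_path(rel_path, source_file)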
diff --git a/lib/galaxy/objectstore/examples/__init__.py b/lib/galaxy/objectstore/examples/__init__.py
new file mode 100644
index 000000000000..42b05b2d6eb2
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/__init__.py
@@ -0,0 +1,5 @@
+from galaxy.util.resources import resource_string
+
+
+def get_example(filename: str) -> str:
+ return resource_string("galaxy.objectstore.examples", filename)
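
get_example returns the raw text of a packaged example file. A short usage sketch (assuming PyYAML is available) showing how a test could parse one of the YAML examples added below:

import yaml  # PyYAML, assumed available

from galaxy.objectstore.examples import get_example

# Read the packaged example as a string and parse it into a config dict.
config_dict = yaml.safe_load(get_example("boto3_simple.yml"))
assert config_dict["type"] == "boto3"
assert config_dict["bucket"]["name"] == "unique_bucket_name_all_lowercase"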
diff --git a/lib/galaxy/objectstore/examples/aws_s3_integration_test.yml b/lib/galaxy/objectstore/examples/aws_s3_integration_test.yml
new file mode 100644
index 000000000000..da3bd8ae3e04
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/aws_s3_integration_test.yml
@@ -0,0 +1,17 @@
+type: aws_s3
+store_by: uuid
+auth:
+ access_key: ${GALAXY_TEST_AWS_ACCESS_KEY}
+ secret_key: ${GALAXY_TEST_AWS_SECRET_KEY}
+
+bucket:
+ name: ${GALAXY_TEST_AWS_BUCKET}
+
+connection:
+ region: ${GALAXY_TEST_AWS_REGION}
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/azure_default_cache.xml b/lib/galaxy/objectstore/examples/azure_default_cache.xml
new file mode 100644
index 000000000000..c9b95b72f62b
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/azure_default_cache.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/azure_default_cache.yml b/lib/galaxy/objectstore/examples/azure_default_cache.yml
new file mode 100644
index 000000000000..8f1eb80e22af
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/azure_default_cache.yml
@@ -0,0 +1,14 @@
+type: azure_blob
+auth:
+ account_name: azureact
+ account_key: password123
+
+container:
+ name: unique_container_name
+ max_chunk_size: 250
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/azure_integration_test.yml b/lib/galaxy/objectstore/examples/azure_integration_test.yml
new file mode 100644
index 000000000000..d44544856a2c
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/azure_integration_test.yml
@@ -0,0 +1,14 @@
+type: azure_blob
+store_by: uuid
+auth:
+ account_name: ${GALAXY_TEST_AZURE_ACCOUNT_NAME}
+ account_key: ${GALAXY_TEST_AZURE_ACCOUNT_KEY}
+
+container:
+ name: ${GALAXY_TEST_AZURE_CONTAINER_NAME}
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/azure_integration_test_distributed.yml b/lib/galaxy/objectstore/examples/azure_integration_test_distributed.yml
new file mode 100644
index 000000000000..d8de569819a6
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/azure_integration_test_distributed.yml
@@ -0,0 +1,38 @@
+type: distributed
+backends:
+- type: azure_blob
+ id: azure1
+ store_by: uuid
+ name: Azure Store 1
+ allow_selection: true
+ weight: 1
+ auth:
+ account_name: ${GALAXY_TEST_AZURE_ACCOUNT_NAME}
+ account_key: ${GALAXY_TEST_AZURE_ACCOUNT_KEY}
+
+ container:
+ name: ${GALAXY_TEST_AZURE_CONTAINER_NAME}
+
+ extra_dirs:
+ - type: job_work
+ path: database/job_working_directory_azure_1
+ - type: temp
+ path: database/tmp_azure_1
+- type: azure_blob
+ id: azure2
+ store_by: uuid
+ name: Azure Store 2
+ allow_selection: true
+ weight: 1
+ auth:
+ account_name: ${GALAXY_TEST_AZURE_ACCOUNT_NAME}
+ account_key: ${GALAXY_TEST_AZURE_ACCOUNT_KEY}
+
+ container:
+ name: ${GALAXY_TEST_AZURE_CONTAINER_NAME}
+
+ extra_dirs:
+ - type: job_work
+ path: database/job_working_directory_azure_2
+ - type: temp
+ path: database/tmp_azure_2
diff --git a/lib/galaxy/objectstore/examples/azure_integration_test_with_account_url.yml b/lib/galaxy/objectstore/examples/azure_integration_test_with_account_url.yml
new file mode 100644
index 000000000000..e8cfcee2ecf9
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/azure_integration_test_with_account_url.yml
@@ -0,0 +1,15 @@
+type: azure_blob
+store_by: uuid
+auth:
+ account_name: ${GALAXY_TEST_AZURE_ACCOUNT_NAME}
+ account_key: ${GALAXY_TEST_AZURE_ACCOUNT_KEY}
+ account_url: ${GALAXY_TEST_AZURE_ACCOUNT_URL}
+
+container:
+ name: ${GALAXY_TEST_AZURE_CONTAINER_NAME}
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/azure_simple.xml b/lib/galaxy/objectstore/examples/azure_simple.xml
new file mode 100644
index 000000000000..aae420d633a6
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/azure_simple.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/azure_simple.yml b/lib/galaxy/objectstore/examples/azure_simple.yml
new file mode 100644
index 000000000000..b2a57ed099fe
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/azure_simple.yml
@@ -0,0 +1,17 @@
+type: azure_blob
+auth:
+ account_name: azureact
+ account_key: password123
+
+container:
+ name: unique_container_name
+
+cache:
+ path: database/object_store_cache
+ size: 100
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/azure_transfer.xml b/lib/galaxy/objectstore/examples/azure_transfer.xml
new file mode 100644
index 000000000000..bab8b2913c63
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/azure_transfer.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/azure_transfer.yml b/lib/galaxy/objectstore/examples/azure_transfer.yml
new file mode 100644
index 000000000000..2e29f85081db
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/azure_transfer.yml
@@ -0,0 +1,24 @@
+type: azure_blob
+auth:
+ account_name: azureact
+ account_key: password123
+
+container:
+ name: unique_container_name
+
+cache:
+ path: database/object_store_cache
+ size: 100
+
+transfer:
+ download_max_concurrency: 1
+ upload_max_concurrency: 2
+ max_single_put_size: 10
+ max_single_get_size: 20
+ max_block_size: 3
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
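
The transfer block above tunes how the Azure Blob SDK moves data. A hedged sketch of how these values could map onto azure.storage.blob.BlobServiceClient: max_single_put_size, max_single_get_size, and max_block_size are accepted as client keyword arguments, while the upload_/download_max_concurrency values correspond to the max_concurrency parameter of the individual blob calls. Illustrative only; the account values are the placeholders from the example and the wiring is not Galaxy's actual code.

from azure.storage.blob import BlobServiceClient

transfer = {
    "download_max_concurrency": 1,
    "upload_max_concurrency": 2,
    "max_single_put_size": 10,
    "max_single_get_size": 20,
    "max_block_size": 3,
}

# Client-level knobs are passed at construction time.
service = BlobServiceClient(
    account_url="https://azureact.blob.core.windows.net",
    credential="password123",
    max_single_put_size=transfer["max_single_put_size"],
    max_single_get_size=transfer["max_single_get_size"],
    max_block_size=transfer["max_block_size"],
)
blob = service.get_blob_client(container="unique_container_name", blob="some/key")

# Per-operation concurrency uses the prefixed values from the config.
blob.upload_blob(b"example bytes", overwrite=True, max_concurrency=transfer["upload_max_concurrency"])
downloader = blob.download_blob(max_concurrency=transfer["download_max_concurrency"])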
diff --git a/lib/galaxy/objectstore/examples/boto3_custom_connection.xml b/lib/galaxy/objectstore/examples/boto3_custom_connection.xml
new file mode 100644
index 000000000000..7256007d9ab1
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_custom_connection.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/boto3_custom_connection.yml b/lib/galaxy/objectstore/examples/boto3_custom_connection.yml
new file mode 100644
index 000000000000..a4485dcc09ef
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_custom_connection.yml
@@ -0,0 +1,21 @@
+type: boto3
+auth:
+ access_key: access_moo
+ secret_key: secret_cow
+
+bucket:
+ name: unique_bucket_name_all_lowercase
+
+cache:
+ path: database/object_store_cache
+ size: 1000
+
+connection:
+ endpoint_url: https://s3.example.org/
+ region: the_example_region
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_s3
+- type: temp
+ path: database/tmp_s3
diff --git a/lib/galaxy/objectstore/examples/boto3_integration_test_aws.yml b/lib/galaxy/objectstore/examples/boto3_integration_test_aws.yml
new file mode 100644
index 000000000000..e3fe42ca027c
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_integration_test_aws.yml
@@ -0,0 +1,14 @@
+type: boto3
+store_by: uuid
+auth:
+ access_key: ${GALAXY_TEST_AWS_ACCESS_KEY}
+ secret_key: ${GALAXY_TEST_AWS_SECRET_KEY}
+
+bucket:
+ name: ${GALAXY_TEST_AWS_BUCKET}
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/boto3_integration_test_aws_new_bucket.yml b/lib/galaxy/objectstore/examples/boto3_integration_test_aws_new_bucket.yml
new file mode 100644
index 000000000000..9a5ccded34aa
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_integration_test_aws_new_bucket.yml
@@ -0,0 +1,14 @@
+type: boto3
+store_by: uuid
+auth:
+ access_key: ${GALAXY_TEST_AWS_ACCESS_KEY}
+ secret_key: ${GALAXY_TEST_AWS_SECRET_KEY}
+
+bucket:
+ name: mycoolbucket${test_random_int}
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/boto3_integration_test_multithreaded.yml b/lib/galaxy/objectstore/examples/boto3_integration_test_multithreaded.yml
new file mode 100644
index 000000000000..fc4932e34700
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_integration_test_multithreaded.yml
@@ -0,0 +1,17 @@
+type: boto3
+store_by: uuid
+auth:
+ access_key: ${GALAXY_TEST_AWS_ACCESS_KEY}
+ secret_key: ${GALAXY_TEST_AWS_SECRET_KEY}
+
+bucket:
+ name: ${GALAXY_TEST_AWS_BUCKET}
+
+transfer:
+ multipart_threshold: 10
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.xml b/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.xml
new file mode 100644
index 000000000000..791e9456607c
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.xml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.yml b/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.yml
new file mode 100644
index 000000000000..3e84dd15c191
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_merged_transfer_options.yml
@@ -0,0 +1,27 @@
+type: boto3
+auth:
+ access_key: access_moo
+ secret_key: secret_cow
+
+bucket:
+ name: unique_bucket_name_all_lowercase
+
+cache:
+ path: database/object_store_cache
+ size: 1000
+
+transfer:
+ multipart_threshold: 13
+ max_concurrency: 13
+ multipart_chunksize: 13
+ num_download_attempts: 13
+ max_io_queue: 13
+ io_chunksize: 13
+ use_threads: false
+ max_bandwidth: 13
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_s3
+- type: temp
+ path: database/tmp_s3
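
The transfer keys in this example line up with the keyword arguments of boto3.s3.transfer.TransferConfig, so a merged (unprefixed) block can be splatted straight into that constructor. A minimal sketch, not Galaxy's actual code:

from boto3.s3.transfer import TransferConfig

merged_options = {
    "multipart_threshold": 13,
    "max_concurrency": 13,
    "multipart_chunksize": 13,
    "num_download_attempts": 13,
    "max_io_queue": 13,
    "io_chunksize": 13,
    "use_threads": False,
    "max_bandwidth": 13,
}

# Keys match TransferConfig's keyword arguments one-to-one.
transfer_config = TransferConfig(**merged_options)
# The resulting object would then be passed to upload_file/download_file via Config=transfer_config.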
diff --git a/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.xml b/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.xml
new file mode 100644
index 000000000000..bec5184c3e00
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.xml
@@ -0,0 +1,24 @@
+
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.yml b/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.yml
new file mode 100644
index 000000000000..044416e5f2bf
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_separated_transfer_options.yml
@@ -0,0 +1,35 @@
+type: boto3
+auth:
+ access_key: access_moo
+ secret_key: secret_cow
+
+bucket:
+ name: unique_bucket_name_all_lowercase
+
+cache:
+ path: database/object_store_cache
+ size: 1000
+
+transfer:
+ upload_multipart_threshold: 13
+ upload_max_concurrency: 13
+ upload_multipart_chunksize: 13
+ upload_num_download_attempts: 13
+ upload_max_io_queue: 13
+ upload_io_chunksize: 13
+ upload_use_threads: false
+ upload_max_bandwidth: 13
+ download_multipart_threshold: 14
+ download_max_concurrency: 14
+ download_multipart_chunksize: 14
+ download_num_download_attempts: 14
+ download_max_io_queue: 14
+ download_io_chunksize: 14
+ download_use_threads: true
+ download_max_bandwidth: 14
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_s3
+- type: temp
+ path: database/tmp_s3
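
Here every option carries an upload_ or download_ prefix, yielding different settings per direction (13s for uploads, 14s for downloads). A hedged sketch of how such prefixes could be resolved into two separate TransferConfig objects, with any unprefixed key acting as a shared default; the helper is illustrative, not Galaxy's implementation.

from boto3.s3.transfer import TransferConfig

TRANSFER_KEYS = (
    "multipart_threshold",
    "max_concurrency",
    "multipart_chunksize",
    "num_download_attempts",
    "max_io_queue",
    "io_chunksize",
    "use_threads",
    "max_bandwidth",
)


def resolve_transfer_config(options: dict, direction: str) -> TransferConfig:
    # Prefer "<direction>_<key>" when present, otherwise fall back to the bare key.
    kwargs = {}
    for key in TRANSFER_KEYS:
        if f"{direction}_{key}" in options:
            kwargs[key] = options[f"{direction}_{key}"]
        elif key in options:
            kwargs[key] = options[key]
    return TransferConfig(**kwargs)


options = {"upload_multipart_threshold": 13, "download_multipart_threshold": 14, "use_threads": False}
upload_config = resolve_transfer_config(options, "upload")      # multipart_threshold=13, use_threads=False
download_config = resolve_transfer_config(options, "download")  # multipart_threshold=14, use_threads=False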
diff --git a/lib/galaxy/objectstore/examples/boto3_simple.xml b/lib/galaxy/objectstore/examples/boto3_simple.xml
new file mode 100644
index 000000000000..c145405d7689
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_simple.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/boto3_simple.yml b/lib/galaxy/objectstore/examples/boto3_simple.yml
new file mode 100644
index 000000000000..8e74986694b5
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/boto3_simple.yml
@@ -0,0 +1,17 @@
+type: boto3
+auth:
+ access_key: access_moo
+ secret_key: secret_cow
+
+bucket:
+ name: unique_bucket_name_all_lowercase
+
+cache:
+ path: database/object_store_cache
+ size: 1000
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_s3
+- type: temp
+ path: database/tmp_s3
diff --git a/lib/galaxy/objectstore/examples/cloud_aws_default_cache.xml b/lib/galaxy/objectstore/examples/cloud_aws_default_cache.xml
new file mode 100644
index 000000000000..4479fe70f8c1
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/cloud_aws_default_cache.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/cloud_aws_no_auth.xml b/lib/galaxy/objectstore/examples/cloud_aws_no_auth.xml
new file mode 100644
index 000000000000..9361987322ee
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/cloud_aws_no_auth.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/cloud_aws_simple.xml b/lib/galaxy/objectstore/examples/cloud_aws_simple.xml
new file mode 100644
index 000000000000..dc22faa2ea54
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/cloud_aws_simple.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/cloud_aws_simple.yml b/lib/galaxy/objectstore/examples/cloud_aws_simple.yml
new file mode 100644
index 000000000000..1f12ee10402c
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/cloud_aws_simple.yml
@@ -0,0 +1,19 @@
+type: cloud
+provider: aws
+auth:
+ access_key: access_moo
+ secret_key: secret_cow
+
+bucket:
+ name: unique_bucket_name_all_lowercase
+ use_reduced_redundancy: false
+
+cache:
+ path: database/object_store_cache
+ size: 1000
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_cloud
+- type: temp
+ path: database/tmp_cloud
diff --git a/lib/galaxy/objectstore/examples/cloud_azure_simple.xml b/lib/galaxy/objectstore/examples/cloud_azure_simple.xml
new file mode 100644
index 000000000000..4f69940bf371
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/cloud_azure_simple.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/cloud_azure_simple.yml b/lib/galaxy/objectstore/examples/cloud_azure_simple.yml
new file mode 100644
index 000000000000..abd0c87d9eab
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/cloud_azure_simple.yml
@@ -0,0 +1,21 @@
+type: cloud
+provider: azure
+auth:
+ subscription_id: a_sub_id
+ client_id: and_a_client_id
+ secret: and_a_secret_key
+ tenant: and_some_tenant_info
+
+bucket:
+ name: unique_bucket_name_all_lowercase
+ use_reduced_redundancy: false
+
+cache:
+ path: database/object_store_cache
+ size: 1000
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_cloud
+- type: temp
+ path: database/tmp_cloud
diff --git a/lib/galaxy/objectstore/examples/cloud_gcp_simple.xml b/lib/galaxy/objectstore/examples/cloud_gcp_simple.xml
new file mode 100644
index 000000000000..9fcb683685cf
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/cloud_gcp_simple.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/cloud_gcp_simple.yml b/lib/galaxy/objectstore/examples/cloud_gcp_simple.yml
new file mode 100644
index 000000000000..c6cfa193291b
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/cloud_gcp_simple.yml
@@ -0,0 +1,18 @@
+type: cloud
+provider: google
+auth:
+ credentials_file: gcp.config
+
+bucket:
+ name: unique_bucket_name_all_lowercase
+ use_reduced_redundancy: false
+
+cache:
+ path: database/object_store_cache
+ size: 1000
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_cloud
+- type: temp
+ path: database/tmp_cloud
diff --git a/lib/galaxy/objectstore/examples/cloud_integration_test_aws.yml b/lib/galaxy/objectstore/examples/cloud_integration_test_aws.yml
new file mode 100644
index 000000000000..f48047c190cd
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/cloud_integration_test_aws.yml
@@ -0,0 +1,15 @@
+type: cloud
+store_by: uuid
+provider: aws
+auth:
+ access_key: ${GALAXY_TEST_AWS_ACCESS_KEY}
+ secret_key: ${GALAXY_TEST_AWS_SECRET_KEY}
+
+bucket:
+ name: ${GALAXY_TEST_AWS_BUCKET}
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/cloud_integration_test_aws_with_region.yml b/lib/galaxy/objectstore/examples/cloud_integration_test_aws_with_region.yml
new file mode 100644
index 000000000000..135429e7029a
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/cloud_integration_test_aws_with_region.yml
@@ -0,0 +1,16 @@
+type: cloud
+store_by: uuid
+provider: aws
+auth:
+ access_key: ${GALAXY_TEST_AWS_ACCESS_KEY}
+ secret_key: ${GALAXY_TEST_AWS_SECRET_KEY}
+ region: ${GALAXY_TEST_AWS_REGION}
+
+bucket:
+ name: ${GALAXY_TEST_AWS_BUCKET}
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/disk_badges.xml b/lib/galaxy/objectstore/examples/disk_badges.xml
new file mode 100644
index 000000000000..6dc1f3a837ad
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/disk_badges.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+ Fast interconnects.
+
+
+ Storage is backed up to tape nightly.
+
+
diff --git a/lib/galaxy/objectstore/examples/disk_badges.yml b/lib/galaxy/objectstore/examples/disk_badges.yml
new file mode 100644
index 000000000000..5d0c23126dba
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/disk_badges.yml
@@ -0,0 +1,16 @@
+type: disk
+files_dir: "${temp_directory}/files1"
+store_by: uuid
+extra_dirs:
+ - type: temp
+ path: "${temp_directory}/tmp1"
+ - type: job_work
+ path: "${temp_directory}/job_working_directory1"
+badges:
+ - type: short_term
+ - type: faster
+ message: Fast interconnects.
+ - type: less_stable
+ - type: more_secure
+ - type: backed_up
+ message: Storage is backed up to tape nightly.
diff --git a/lib/galaxy/objectstore/examples/distributed_disk.xml b/lib/galaxy/objectstore/examples/distributed_disk.xml
new file mode 100644
index 000000000000..322e414e6041
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/distributed_disk.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/distributed_disk.yml b/lib/galaxy/objectstore/examples/distributed_disk.yml
new file mode 100644
index 000000000000..080d01f2c104
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/distributed_disk.yml
@@ -0,0 +1,26 @@
+type: distributed
+backends:
+ - id: files1
+ quota:
+ source: 1files
+ type: disk
+ weight: 2
+ device: primary_disk
+ files_dir: "${temp_directory}/files1"
+ extra_dirs:
+ - type: temp
+ path: "${temp_directory}/tmp1"
+ - type: job_work
+ path: "${temp_directory}/job_working_directory1"
+ - id: files2
+ quota:
+ source: 2files
+ type: disk
+ weight: 1
+ device: primary_disk
+ files_dir: "${temp_directory}/files2"
+ extra_dirs:
+ - type: temp
+ path: "${temp_directory}/tmp2"
+ - type: job_work
+ path: "${temp_directory}/job_working_directory2"
diff --git a/lib/galaxy/objectstore/examples/distributed_s3.yml b/lib/galaxy/objectstore/examples/distributed_s3.yml
new file mode 100644
index 000000000000..ec73647b9454
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/distributed_s3.yml
@@ -0,0 +1,34 @@
+type: distributed
+backends:
+ - id: files1
+ weight: 1
+ type: s3
+ auth:
+ access_key: access_moo
+ secret_key: secret_cow
+
+ bucket:
+ name: unique_bucket_name_all_lowercase
+ use_reduced_redundancy: false
+
+ extra_dirs:
+ - type: job_work
+ path: ${temp_directory}/job_working_directory_s3
+ - type: temp
+ path: ${temp_directory}/tmp_s3
+ - id: files2
+ weight: 1
+ type: s3
+ auth:
+ access_key: access_moo
+ secret_key: secret_cow
+
+ bucket:
+ name: unique_bucket_name_all_lowercase_2
+ use_reduced_redundancy: false
+
+ extra_dirs:
+ - type: job_work
+ path: ${temp_directory}/job_working_directory_s3_2
+ - type: temp
+ path: ${temp_directory}/tmp_s3_2
diff --git a/lib/galaxy/objectstore/examples/gcp_boto3_integration_test.yml b/lib/galaxy/objectstore/examples/gcp_boto3_integration_test.yml
new file mode 100644
index 000000000000..a8ce3e2ce3c1
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/gcp_boto3_integration_test.yml
@@ -0,0 +1,17 @@
+type: boto3
+store_by: uuid
+auth:
+ access_key: ${GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY}
+ secret_key: ${GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY}
+
+bucket:
+ name: ${GALAXY_TEST_GOOGLE_BUCKET}
+
+connection:
+ endpoint_url: https://storage.googleapis.com
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/gcp_boto3_integration_test_legacy_params.yml b/lib/galaxy/objectstore/examples/gcp_boto3_integration_test_legacy_params.yml
new file mode 100644
index 000000000000..04c342cb83e8
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/gcp_boto3_integration_test_legacy_params.yml
@@ -0,0 +1,20 @@
+type: boto3
+store_by: uuid
+auth:
+ access_key: ${GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY}
+ secret_key: ${GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY}
+
+bucket:
+ name: ${GALAXY_TEST_GOOGLE_BUCKET}
+
+connection:
+ host: storage.googleapis.com
+ port: 443
+ secure: true
+ conn_path: '/'
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
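
This variant exercises the older host/port/secure/conn_path connection style, whereas gcp_boto3_integration_test.yml above expresses the same service as a single endpoint_url. A hedged sketch of how those legacy fields could collapse to such a URL (the helper name is hypothetical, not Galaxy's code):

def infer_endpoint_url(host: str, port: int, secure: bool = True, conn_path: str = "/") -> str:
    # Assemble scheme://host:port/<path>, dropping an empty or root-only path.
    scheme = "https" if secure else "http"
    path = conn_path.strip("/")
    url = f"{scheme}://{host}:{port}"
    return f"{url}/{path}" if path else url


print(infer_endpoint_url("storage.googleapis.com", 443, True, "/"))  # https://storage.googleapis.com:443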
diff --git a/lib/galaxy/objectstore/examples/gcp_s3_integration_test.yml b/lib/galaxy/objectstore/examples/gcp_s3_integration_test.yml
new file mode 100644
index 000000000000..44d071778533
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/gcp_s3_integration_test.yml
@@ -0,0 +1,18 @@
+type: generic_s3
+store_by: uuid
+auth:
+ access_key: ${GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY}
+ secret_key: ${GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY}
+
+bucket:
+ name: ${GALAXY_TEST_GOOGLE_BUCKET}
+
+connection:
+ host: storage.googleapis.com
+ port: 443
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_azure
+- type: temp
+ path: database/tmp_azure
diff --git a/lib/galaxy/objectstore/examples/hierarchical_simple.xml b/lib/galaxy/objectstore/examples/hierarchical_simple.xml
new file mode 100644
index 000000000000..2e8ec1051257
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/hierarchical_simple.xml
@@ -0,0 +1,23 @@
+
+
+
+
+
+ This is our new storage cluster, check out the storage
+ on our institute's system page for [Fancy New Storage](http://computecenter.example.com/systems/fancystorage).
+
+
+
+
+
+
+
+ This is our older legacy storage cluster, check out the storage
+ on our institute's system page for [Legacy Storage](http://computecenter.example.com/systems/legacystorage).
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/hierarchical_simple.yml b/lib/galaxy/objectstore/examples/hierarchical_simple.yml
new file mode 100644
index 000000000000..1755b5c82099
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/hierarchical_simple.yml
@@ -0,0 +1,28 @@
+type: hierarchical
+backends:
+ - id: files1
+ name: Newer Cool Storage
+ description: |
+ This is our new storage cluster, check out the storage
+ on our institute's system page for [Fancy New Storage](http://computecenter.example.com/systems/fancystorage).
+ type: disk
+ weight: 1
+ files_dir: "${temp_directory}/files1"
+ extra_dirs:
+ - type: temp
+ path: "${temp_directory}/tmp1"
+ - type: job_work
+ path: "${temp_directory}/job_working_directory1"
+ - id: files2
+ name: Older Legacy Storage
+ description: |
+ This is our older legacy storage cluster, check out the storage
+ on our institute's system page for [Legacy Storage](http://computecenter.example.com/systems/legacystorage).
+ type: disk
+ weight: 1
+ files_dir: "${temp_directory}/files2"
+ extra_dirs:
+ - type: temp
+ path: "${temp_directory}/tmp2"
+ - type: job_work
+ path: "${temp_directory}/job_working_directory2"
diff --git a/lib/galaxy/objectstore/examples/pithos_simple.xml b/lib/galaxy/objectstore/examples/pithos_simple.xml
new file mode 100644
index 000000000000..d7a5c30f11b1
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/pithos_simple.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/pithos_simple.yml b/lib/galaxy/objectstore/examples/pithos_simple.yml
new file mode 100644
index 000000000000..86bd6c2cc965
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/pithos_simple.yml
@@ -0,0 +1,14 @@
+type: pithos
+auth:
+ url: http://example.org/
+ token: extoken123
+
+container:
+ name: foo
+ project: cow
+
+extra_dirs:
+ - type: temp
+ path: database/tmp_pithos
+ - type: job_work
+ path: database/working_pithos
diff --git a/lib/galaxy/objectstore/examples/s3_global_cache.xml b/lib/galaxy/objectstore/examples/s3_global_cache.xml
new file mode 100644
index 000000000000..ec1a4e40ea4c
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/s3_global_cache.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/lib/galaxy/objectstore/examples/s3_global_cache.yml b/lib/galaxy/objectstore/examples/s3_global_cache.yml
new file mode 100644
index 000000000000..de08a7142f43
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/s3_global_cache.yml
@@ -0,0 +1,15 @@
+type: s3
+private: true
+auth:
+ access_key: access_moo
+ secret_key: secret_cow
+
+bucket:
+ name: unique_bucket_name_all_lowercase
+ use_reduced_redundancy: false
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_s3
+- type: temp
+ path: database/tmp_s3
diff --git a/lib/galaxy/objectstore/examples/s3_simple.xml b/lib/galaxy/objectstore/examples/s3_simple.xml
new file mode 100644
index 000000000000..c64c618021f3
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/s3_simple.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/lib/galaxy/objectstore/examples/s3_simple.yml b/lib/galaxy/objectstore/examples/s3_simple.yml
new file mode 100644
index 000000000000..b56a251b9c29
--- /dev/null
+++ b/lib/galaxy/objectstore/examples/s3_simple.yml
@@ -0,0 +1,19 @@
+type: s3
+private: true
+auth:
+ access_key: access_moo
+ secret_key: secret_cow
+
+bucket:
+ name: unique_bucket_name_all_lowercase
+ use_reduced_redundancy: false
+
+cache:
+ path: database/object_store_cache
+ size: 1000
+
+extra_dirs:
+- type: job_work
+ path: database/job_working_directory_s3
+- type: temp
+ path: database/tmp_s3
diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py
index bb0bdead62b2..9241c1efe75c 100644
--- a/lib/galaxy/objectstore/irods.py
+++ b/lib/galaxy/objectstore/irods.py
@@ -20,19 +20,12 @@
except ImportError:
irods = None
-from galaxy.exceptions import (
- ObjectInvalid,
- ObjectNotFound,
-)
from galaxy.util import (
- directory_hash_id,
ExecutionTimer,
string_as_bool,
- umask_fix_perms,
unlink,
)
-from galaxy.util.path import safe_relpath
-from . import DiskObjectStore
+from ._caching_base import CachingConcreteObjectStore
IRODS_IMPORT_MESSAGE = "The Python irods package is required to use this feature, please install it"
# 1 MB
@@ -115,7 +108,7 @@ def parse_config_xml(config_xml):
"cache_updated_data": cache_updated_data,
},
"extra_dirs": extra_dirs,
- "private": DiskObjectStore.parse_private_from_config_xml(config_xml),
+ "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml),
}
except Exception:
# Toss it back up after logging, we can't continue loading at this point.
@@ -123,35 +116,7 @@ def parse_config_xml(config_xml):
raise
-class CloudConfigMixin:
- def _config_to_dict(self):
- return {
- "auth": {
- "username": self.username,
- "password": self.password,
- },
- "resource": {
- "name": self.resource,
- },
- "zone": {
- "name": self.zone,
- },
- "connection": {
- "host": self.host,
- "port": self.port,
- "timeout": self.timeout,
- "refresh_time": self.refresh_time,
- "connection_pool_monitor_interval": self.connection_pool_monitor_interval,
- },
- "cache": {
- "size": self.cache_size,
- "path": self.staging_path,
- "cache_updated_data": self.cache_updated_data,
- },
- }
-
-
-class IRODSObjectStore(DiskObjectStore, CloudConfigMixin):
+class IRODSObjectStore(CachingConcreteObjectStore):
"""
Object store that stores files as data objects in an iRODS Zone. A local cache
exists that is used as an intermediate location for files between Galaxy and iRODS.
@@ -314,73 +279,34 @@ def to_dict(self):
as_dict.update(self._config_to_dict())
return as_dict
- def _fix_permissions(self, rel_path):
- """Set permissions on rel_path"""
- for basedir, _, files in os.walk(rel_path):
- umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
- for filename in files:
- path = os.path.join(basedir, filename)
- # Ignore symlinks
- if os.path.islink(path):
- continue
- umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)
-
- def _construct_path(
- self,
- obj,
- base_dir=None,
- dir_only=None,
- extra_dir=None,
- extra_dir_at_root=False,
- alt_name=None,
- obj_dir=False,
- in_cache=False,
- **kwargs,
- ):
- ipt_timer = ExecutionTimer()
- # extra_dir should never be constructed from provided data but just
- # make sure there are no shenanigans afoot
- if extra_dir and extra_dir != os.path.normpath(extra_dir):
- log.warning("extra_dir is not normalized: %s", extra_dir)
- raise ObjectInvalid("The requested object is invalid")
- # ensure that any parent directory references in alt_name would not
- # result in a path not contained in the directory path constructed here
- if alt_name:
- if not safe_relpath(alt_name):
- log.warning("alt_name would locate path outside dir: %s", alt_name)
- raise ObjectInvalid("The requested object is invalid")
- # alt_name can contain parent directory references, but S3 will not
- # follow them, so if they are valid we normalize them out
- alt_name = os.path.normpath(alt_name)
- rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
- if extra_dir is not None:
- if extra_dir_at_root:
- rel_path = os.path.join(extra_dir, rel_path)
- else:
- rel_path = os.path.join(rel_path, extra_dir)
-
- # for JOB_WORK directory
- if obj_dir:
- rel_path = os.path.join(rel_path, str(self._get_object_id(obj)))
- if base_dir:
- base = self.extra_dirs.get(base_dir)
- log.debug("irods_pt _construct_path: %s", ipt_timer)
- return os.path.join(base, rel_path)
-
- if not dir_only:
- rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
- log.debug("irods_pt _construct_path: %s", ipt_timer)
-
- if in_cache:
- return self._get_cache_path(rel_path)
-
- return rel_path
-
- def _get_cache_path(self, rel_path):
- return os.path.abspath(os.path.join(self.staging_path, rel_path))
+ def _config_to_dict(self):
+ return {
+ "auth": {
+ "username": self.username,
+ "password": self.password,
+ },
+ "resource": {
+ "name": self.resource,
+ },
+ "zone": {
+ "name": self.zone,
+ },
+ "connection": {
+ "host": self.host,
+ "port": self.port,
+ "timeout": self.timeout,
+ "refresh_time": self.refresh_time,
+ "connection_pool_monitor_interval": self.connection_pool_monitor_interval,
+ },
+ "cache": {
+ "size": self.cache_size,
+ "path": self.staging_path,
+ "cache_updated_data": self.cache_updated_data,
+ },
+ }
# rel_path is file or folder?
- def _get_size_in_irods(self, rel_path):
+ def _get_remote_size(self, rel_path):
ipt_timer = ExecutionTimer()
p = Path(rel_path)
data_object_name = p.stem + p.suffix
@@ -397,10 +323,10 @@ def _get_size_in_irods(self, rel_path):
log.warning("Collection or data object (%s) does not exist", data_object_path)
return -1
finally:
- log.debug("irods_pt _get_size_in_irods: %s", ipt_timer)
+ log.debug("irods_pt _get_remote_size: %s", ipt_timer)
# rel_path is file or folder?
- def _data_object_exists(self, rel_path):
+ def _exists_remotely(self, rel_path):
ipt_timer = ExecutionTimer()
p = Path(rel_path)
data_object_name = p.stem + p.suffix
@@ -417,28 +343,12 @@ def _data_object_exists(self, rel_path):
log.debug("Collection or data object (%s) does not exist", data_object_path)
return False
finally:
- log.debug("irods_pt _data_object_exists: %s", ipt_timer)
-
- def _in_cache(self, rel_path):
- """Check if the given dataset is in the local cache and return True if so."""
- cache_path = self._get_cache_path(rel_path)
- return os.path.exists(cache_path)
-
- def _pull_into_cache(self, rel_path):
- ipt_timer = ExecutionTimer()
- # Ensure the cache directory structure exists (e.g., dataset_#_files/)
- rel_path_dir = os.path.dirname(rel_path)
- if not os.path.exists(self._get_cache_path(rel_path_dir)):
- os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True)
- # Now pull in the file
- file_ok = self._download(rel_path)
- self._fix_permissions(self._get_cache_path(rel_path_dir))
- log.debug("irods_pt _pull_into_cache: %s", ipt_timer)
- return file_ok
+ log.debug("irods_pt _exists_remotely: %s", ipt_timer)
def _download(self, rel_path):
ipt_timer = ExecutionTimer()
- log.debug("Pulling data object '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
+ cache_path = self._get_cache_path(rel_path)
+ log.debug("Pulling data object '%s' into cache to %s", rel_path, cache_path)
p = Path(rel_path)
data_object_name = p.stem + p.suffix
@@ -452,7 +362,6 @@ def _download(self, rel_path):
options = {kw.FORCE_FLAG_KW: "", kw.DEST_RESC_NAME_KW: self.resource}
try:
- cache_path = self._get_cache_path(rel_path)
self.session.data_objects.get(data_object_path, cache_path, **options)
log.debug("Pulled data object '%s' into cache to %s", rel_path, cache_path)
return True
@@ -462,7 +371,7 @@ def _download(self, rel_path):
finally:
log.debug("irods_pt _download: %s", ipt_timer)
- def _push_to_irods(self, rel_path, source_file=None, from_string=None):
+ def _push_to_storage(self, rel_path, source_file=None, from_string=None):
"""
Push the file pointed to by ``rel_path`` to the iRODS. Extract folder name
from rel_path as iRODS collection name, and extract file name from rel_path
@@ -539,103 +448,7 @@ def _push_to_irods(self, rel_path, source_file=None, from_string=None):
)
return True
finally:
- log.debug("irods_pt _push_to_irods: %s", ipt_timer)
-
- def file_ready(self, obj, **kwargs):
- """
- A helper method that checks if a file corresponding to a dataset is
- ready and available to be used. Return ``True`` if so, ``False`` otherwise.
- """
- ipt_timer = ExecutionTimer()
- rel_path = self._construct_path(obj, **kwargs)
- # Make sure the size in cache is available in its entirety
- if self._in_cache(rel_path):
- if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_irods(rel_path):
- log.debug("irods_pt _file_ready: %s", ipt_timer)
- return True
- log.debug(
- "Waiting for dataset %s to transfer from OS: %s/%s",
- rel_path,
- os.path.getsize(self._get_cache_path(rel_path)),
- self._get_size_in_irods(rel_path),
- )
- log.debug("irods_pt _file_ready: %s", ipt_timer)
- return False
-
- def _exists(self, obj, **kwargs):
- ipt_timer = ExecutionTimer()
- rel_path = self._construct_path(obj, **kwargs)
-
- # Check cache and irods
- if self._in_cache(rel_path) or self._data_object_exists(rel_path):
- log.debug("irods_pt _exists: %s", ipt_timer)
- return True
-
- # dir_only does not get synced so shortcut the decision
- dir_only = kwargs.get("dir_only", False)
- base_dir = kwargs.get("base_dir", None)
- if dir_only and base_dir:
- # for JOB_WORK directory
- if not os.path.exists(rel_path):
- os.makedirs(rel_path, exist_ok=True)
- log.debug("irods_pt _exists: %s", ipt_timer)
- return True
- log.debug("irods_pt _exists: %s", ipt_timer)
- return False
-
- def _create(self, obj, **kwargs):
- ipt_timer = ExecutionTimer()
- if not self._exists(obj, **kwargs):
- # Pull out locally used fields
- extra_dir = kwargs.get("extra_dir", None)
- extra_dir_at_root = kwargs.get("extra_dir_at_root", False)
- dir_only = kwargs.get("dir_only", False)
- alt_name = kwargs.get("alt_name", None)
-
- # Construct hashed path
- rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
-
- # Optionally append extra_dir
- if extra_dir is not None:
- if extra_dir_at_root:
- rel_path = os.path.join(extra_dir, rel_path)
- else:
- rel_path = os.path.join(rel_path, extra_dir)
-
- # Create given directory in cache
- cache_dir = os.path.join(self.staging_path, rel_path)
- if not os.path.exists(cache_dir):
- os.makedirs(cache_dir, exist_ok=True)
-
- if not dir_only:
- rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
- open(os.path.join(self.staging_path, rel_path), "w").close()
- self._push_to_irods(rel_path, from_string="")
- log.debug("irods_pt _create: %s", ipt_timer)
- return self
-
- def _empty(self, obj, **kwargs):
- if self._exists(obj, **kwargs):
- return bool(self._size(obj, **kwargs) > 0)
- else:
- raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}")
-
- def _size(self, obj, **kwargs) -> int:
- ipt_timer = ExecutionTimer()
- rel_path = self._construct_path(obj, **kwargs)
- if self._in_cache(rel_path):
- try:
- return os.path.getsize(self._get_cache_path(rel_path))
- except OSError as ex:
- log.info("Could not get size of file '%s' in local cache, will try iRODS. Error: %s", rel_path, ex)
- finally:
- log.debug("irods_pt _size: %s", ipt_timer)
- elif self._exists(obj, **kwargs):
- log.debug("irods_pt _size: %s", ipt_timer)
- return self._get_size_in_irods(rel_path)
- log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
- log.debug("irods_pt _size: %s", ipt_timer)
- return 0
+ log.debug("irods_pt _push_to_storage: %s", ipt_timer)
def _delete(self, obj, entire_dir=False, **kwargs):
ipt_timer = ExecutionTimer()
@@ -705,92 +518,6 @@ def _delete(self, obj, entire_dir=False, **kwargs):
log.debug("irods_pt _delete: %s", ipt_timer)
return False
- def _get_data(self, obj, start=0, count=-1, **kwargs):
- ipt_timer = ExecutionTimer()
- rel_path = self._construct_path(obj, **kwargs)
- # Check cache first and get file if not there
- if not self._in_cache(rel_path):
- self._pull_into_cache(rel_path)
- # Read the file content from cache
- data_file = open(self._get_cache_path(rel_path))
- data_file.seek(start)
- content = data_file.read(count)
- data_file.close()
- log.debug("irods_pt _get_data: %s", ipt_timer)
- return content
-
- def _get_filename(self, obj, **kwargs):
- ipt_timer = ExecutionTimer()
- base_dir = kwargs.get("base_dir", None)
- dir_only = kwargs.get("dir_only", False)
- obj_dir = kwargs.get("obj_dir", False)
- rel_path = self._construct_path(obj, **kwargs)
- sync_cache = kwargs.get("sync_cache", True)
-
- # for JOB_WORK directory
- if base_dir and dir_only and obj_dir:
- log.debug("irods_pt _get_filename: %s", ipt_timer)
- return os.path.abspath(rel_path)
-
- cache_path = self._get_cache_path(rel_path)
- if not sync_cache:
- return cache_path
- # iRODS does not recognize directories as files so cannot check if those exist.
- # So, if checking dir only, ensure given dir exists in cache and return
- # the expected cache path.
- # dir_only = kwargs.get('dir_only', False)
- # if dir_only:
- # if not os.path.exists(cache_path):
- # os.makedirs(cache_path)
- # return cache_path
- # Check if the file exists in the cache first, always pull if file size in cache is zero
- if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0):
- log.debug("irods_pt _get_filename: %s", ipt_timer)
- return cache_path
- # Check if the file exists in persistent storage and, if it does, pull it into cache
- elif self._exists(obj, **kwargs):
- if dir_only: # Directories do not get pulled into cache
- log.debug("irods_pt _get_filename: %s", ipt_timer)
- return cache_path
- else:
- if self._pull_into_cache(rel_path):
- log.debug("irods_pt _get_filename: %s", ipt_timer)
- return cache_path
- # For the case of retrieving a directory only, return the expected path
- # even if it does not exist.
- # if dir_only:
- # return cache_path
- log.debug("irods_pt _get_filename: %s", ipt_timer)
- raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}")
- # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path
-
- def _update_from_file(self, obj, file_name=None, create=False, **kwargs):
- ipt_timer = ExecutionTimer()
- if create:
- self._create(obj, **kwargs)
- if self._exists(obj, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- # Choose whether to use the dataset file itself or an alternate file
- if file_name:
- source_file = os.path.abspath(file_name)
- # Copy into cache
- cache_file = self._get_cache_path(rel_path)
- try:
- if source_file != cache_file and self.cache_updated_data:
- # FIXME? Should this be a `move`?
- shutil.copy2(source_file, cache_file)
- self._fix_permissions(cache_file)
- except OSError:
- log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
- else:
- source_file = self._get_cache_path(rel_path)
- # Update the file on iRODS
- self._push_to_irods(rel_path, source_file)
- else:
- log.debug("irods_pt _update_from_file: %s", ipt_timer)
- raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}")
- log.debug("irods_pt _update_from_file: %s", ipt_timer)
-
# Unlike S3, url is not really applicable to iRODS
def _get_object_url(self, obj, **kwargs):
if self._exists(obj, **kwargs):
diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py
index 60a710f1542d..43697062d9d0 100644
--- a/lib/galaxy/objectstore/pithos.py
+++ b/lib/galaxy/objectstore/pithos.py
@@ -4,7 +4,6 @@
import logging
import os
-import shutil
try:
from kamaki.clients import (
@@ -17,16 +16,8 @@
except ImportError:
KamakiClient = None
-from galaxy.exceptions import (
- ObjectInvalid,
- ObjectNotFound,
-)
-from galaxy.util import (
- directory_hash_id,
- umask_fix_perms,
-)
-from galaxy.util.path import safe_relpath
-from . import ConcreteObjectStore
+from galaxy.util import directory_hash_id
+from ._caching_base import CachingConcreteObjectStore
NO_KAMAKI_ERROR_MESSAGE = (
"ObjectStore configured, but no kamaki.clients dependency available."
@@ -77,7 +68,7 @@ def parse_config_xml(config_xml):
log.error(msg)
raise Exception(msg)
r["extra_dirs"] = [{k: e.get(k) for k in attrs} for e in extra_dirs]
- r["private"] = ConcreteObjectStore.parse_private_from_config_xml(config_xml)
+ r["private"] = CachingConcreteObjectStore.parse_private_from_config_xml(config_xml)
if "job_work" not in (d["type"] for d in r["extra_dirs"]):
msg = f'No value for {tag}:type="job_work" in XML tree'
log.error(msg)
@@ -88,7 +79,7 @@ def parse_config_xml(config_xml):
return r
-class PithosObjectStore(ConcreteObjectStore):
+class PithosObjectStore(CachingConcreteObjectStore):
"""
Object store that stores objects as items in a Pithos+ container.
Cache is ignored for the time being.
@@ -101,7 +92,6 @@ def __init__(self, config, config_dict):
self.staging_path = self.config.file_path
log.info("Parse config_xml for pithos object store")
self.config_dict = config_dict
- log.debug(self.config_dict)
self._initialize()
@@ -109,6 +99,7 @@ def _initialize(self):
if KamakiClient is None:
raise Exception(NO_KAMAKI_ERROR_MESSAGE)
+ self._ensure_staging_path_writable()
log.info("Authenticate Synnefo account")
self._authenticate()
log.info("Initialize Pithos+ client")
@@ -152,91 +143,9 @@ def _init_pithos(self):
if project and c.get("x-container-policy-project") != project:
self.pithos.reassign_container(project)
- def _construct_path(
- self,
- obj,
- base_dir=None,
- dir_only=None,
- extra_dir=None,
- extra_dir_at_root=False,
- alt_name=None,
- obj_dir=False,
- in_cache=False,
- **kwargs,
- ):
- """Construct path from object and parameters"""
- # param extra_dir: should never be constructed from provided data but
- # just make sure there are no shenannigans afoot
- if extra_dir and extra_dir != os.path.normpath(extra_dir):
- log.warning(f"extra_dir is not normalized: {extra_dir}")
- raise ObjectInvalid("The requested object is invalid")
- # ensure that any parent directory references in alt_name would not
- # result in a path not contained in the directory path constructed here
- if alt_name:
- if not safe_relpath(alt_name):
- log.warning(f"alt_name would locate path outside dir: {alt_name}")
- raise ObjectInvalid("The requested object is invalid")
- # alt_name can contain parent directory references, but S3 will not
- # follow them, so if they are valid we normalize them out
- alt_name = os.path.normpath(alt_name)
- rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
- if extra_dir is not None:
- if extra_dir_at_root:
- rel_path = os.path.join(extra_dir, rel_path)
- else:
- rel_path = os.path.join(rel_path, extra_dir)
-
- # for JOB_WORK directory
- if obj_dir:
- rel_path = os.path.join(rel_path, str(self._get_object_id(obj)))
- if base_dir:
- base = self.extra_dirs.get(base_dir)
- return os.path.join(base, rel_path)
-
- # Pithos+ folders are marked by having trailing '/' so add it now
- rel_path = f"{rel_path}/"
-
- if not dir_only:
- an = alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat"
- rel_path = os.path.join(rel_path, an)
-
- if in_cache:
- return self._get_cache_path(rel_path)
-
- return rel_path
-
- def _get_cache_path(self, rel_path):
- return os.path.abspath(os.path.join(self.staging_path, rel_path))
-
- def _in_cache(self, rel_path):
- """Check if the given dataset is in the local cache and return True if
- so.
- """
- cache_path = self._get_cache_path(rel_path)
- return os.path.exists(cache_path)
-
- def _fix_permissions(self, rel_path):
- """Set permissions on rel_path"""
- for basedir, _, files in os.walk(rel_path):
- umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
- for filename in files:
- path = os.path.join(basedir, filename)
- # Ignore symlinks
- if os.path.islink(path):
- continue
- umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)
-
- def _pull_into_cache(self, rel_path):
- # Ensure the cache directory structure exists (e.g., dataset_#_files/)
- rel_path_dir = os.path.dirname(rel_path)
- rel_cache_path_dir = self._get_cache_path(rel_path_dir)
- if not os.path.exists(rel_cache_path_dir):
- os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True)
- # Now pull in the file
- cache_path = self._get_cache_path(rel_path_dir)
- self.pithos.download_object(rel_path, cache_path)
- self._fix_permissions(cache_path)
- return cache_path
+ def _download(self, rel_path):
+ local_destination = self._get_cache_path(rel_path)
+ self.pithos.download_object(rel_path, local_destination)
# No need to overwrite "shutdown"
@@ -305,27 +214,7 @@ def _create(self, obj, **kwargs):
self.pithos.upload_from_string(rel_path, "")
return self
- def _empty(self, obj, **kwargs):
- """
- :returns: weather the object has content
- :raises ObjectNotFound:
- """
- if not self._exists(obj, **kwargs):
- raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}")
- return bool(self._size(obj, **kwargs))
-
- def _size(self, obj, **kwargs) -> int:
- """
- :returns: The size of the object, or 0 if it doesn't exist (sorry for
- that, not our fault, the ObjectStore interface is like that some
- times)
- """
- path = self._construct_path(obj, **kwargs)
- if self._in_cache(path):
- try:
- return os.path.getsize(self._get_cache_path(path))
- except OSError as ex:
- log.warning("Could not get size of file %s in local cache, will try Pithos. Error: %s", path, ex)
+ def _get_remote_size(self, path):
try:
file = self.pithos.get_object_info(path)
except ClientError as ce:
@@ -334,96 +223,22 @@ def _size(self, obj, **kwargs) -> int:
return 0
return int(file["content-length"])
- def _delete(self, obj, **kwargs):
- """Delete the object
- :returns: weather the object was deleted
- """
- path = self._construct_path(obj, **kwargs)
- base_dir = kwargs.get("base_dir", None)
- dir_only = kwargs.get("dir_only", False)
- obj_dir = kwargs.get("obj_dir", False)
+ def _delete_remote_all(self, path: str) -> bool:
try:
- if all((base_dir, dir_only, obj_dir)):
- shutil.rmtree(os.path.abspath(path))
- return True
- cache_path = self._get_cache_path(path)
-
- entire_dir = kwargs.get("entire_dir", False)
- extra_dir = kwargs.get("extra_dir", False)
- if entire_dir and extra_dir:
- shutil.rmtree(cache_path)
- log.debug(f"On Pithos: delete -r {path}/")
- self.pithos.del_object(path, delimiter="/")
- return True
- else:
- os.unlink(cache_path)
- self.pithos.del_object(path)
- except OSError:
- log.exception(f"{self._get_filename(obj, **kwargs)} delete error")
- except ClientError as ce:
- log.exception(f"Could not delete {path} from Pithos, {ce}")
- return False
-
- def _get_data(self, obj, start=0, count=-1, **kwargs):
- """Fetch (e.g., download) data
- :param start: Chunk of data starts here
- :param count: Fetch at most as many data, fetch all if negative
- """
- path = self._construct_path(obj, **kwargs)
- if self._in_cache(path):
- cache_path = self._pull_into_cache(path)
- else:
- cache_path = self._get_cache_path(path)
- data_file = open(cache_path)
- data_file.seek(start)
- content = data_file.read(count)
- data_file.close()
- return content
-
- def _get_filename(self, obj, **kwargs):
- """Get the expected filename with absolute path"""
- base_dir = kwargs.get("base_dir", None)
- dir_only = kwargs.get("dir_only", False)
- obj_dir = kwargs.get("obj_dir", False)
- path = self._construct_path(obj, **kwargs)
-
- # for JOB_WORK directory
- if base_dir and dir_only and obj_dir:
- return os.path.abspath(path)
- cache_path = self._get_cache_path(path)
- if dir_only:
- if not os.path.exists(cache_path):
- os.makedirs(cache_path, exist_ok=True)
- return cache_path
- if self._in_cache(path):
- return cache_path
- elif self._exists(obj, **kwargs):
- if not dir_only:
- self._pull_into_cache(path)
- return cache_path
- raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}")
-
- def _update_from_file(self, obj, **kwargs):
- """Update the store when a file is updated"""
- if kwargs.get("create"):
- self._create(obj, **kwargs)
- if not self._exists(obj, **kwargs):
- raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}")
+ log.debug(f"On Pithos: delete -r {path}/")
+ self.pithos.del_object(path, delimiter="/")
+ return True
+ except ClientError:
+ log.exception(f"Could not delete path '{path}' from Pithos")
+ return False
- path = self._construct_path(obj, **kwargs)
- cache_path = self._get_cache_path(path)
- file_name = kwargs.get("file_name")
- if file_name:
- source_path = os.path.abspath(file_name)
- try:
- if source_path != cache_path:
- shutil.copy2(source_path, cache_path)
- self._fix_permissions(cache_path)
- except OSError:
- log.exception('Trouble copying source file "%s" to cache "%s"', source_path, cache_path)
- else:
- with open(cache_path) as f:
- self.pithos.upload_object(obj, f)
+ def _delete_existing_remote(self, path: str) -> bool:
+ try:
+ self.pithos.del_object(path)
+ return True
+ except ClientError:
+ log.exception(f"Could not delete path '{path}' from Pithos")
+ return False
def _get_object_url(self, obj, **kwargs):
"""
diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py
index 1d9c3d48b8d7..4bb6540a34de 100644
--- a/lib/galaxy/objectstore/rucio.py
+++ b/lib/galaxy/objectstore/rucio.py
@@ -2,7 +2,6 @@
import logging
import os
import shutil
-from typing import Optional
try:
import rucio.common
@@ -32,12 +31,9 @@
umask_fix_perms,
unlink,
)
-from galaxy.util.path import safe_relpath
-from . import ConcreteObjectStore
+from ._caching_base import CachingConcreteObjectStore
from .caching import (
- CacheTarget,
enable_cache_monitor,
- InProcessCacheMonitor,
parse_caching_config_dict_from_xml,
)
@@ -273,7 +269,7 @@ def delete(self, key, auth_token):
return True
-class RucioObjectStore(ConcreteObjectStore):
+class RucioObjectStore(CachingConcreteObjectStore):
"""
Object store implementation that uses ORNL remote data broker.
@@ -281,8 +277,6 @@ class RucioObjectStore(ConcreteObjectStore):
Galaxy at some future point or significantly modified.
"""
- cache_monitor: Optional[InProcessCacheMonitor] = None
-
store_type = "rucio"
def to_dict(self):
@@ -309,59 +303,8 @@ def __init__(self, config, config_dict):
self._initialize()
def _initialize(self):
- if self.enable_cache_monitor:
- self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval)
-
- def _in_cache(self, rel_path):
- """Check if the given dataset is in the local cache and return True if so."""
- cache_path = self._get_cache_path(rel_path)
- return os.path.exists(cache_path)
-
- def _construct_path(
- self,
- obj,
- base_dir=None,
- dir_only=None,
- extra_dir=None,
- extra_dir_at_root=False,
- alt_name=None,
- obj_dir=False,
- **kwargs,
- ):
- # extra_dir should never be constructed from provided data but just
- # make sure there are no shenanigans afoot
- if extra_dir and extra_dir != os.path.normpath(extra_dir):
- log.warning("extra_dir is not normalized: %s", extra_dir)
- raise ObjectInvalid("The requested object is invalid")
- # ensure that any parent directory references in alt_name would not
- # result in a path not contained in the directory path constructed here
- if alt_name:
- if not safe_relpath(alt_name):
- log.warning("alt_name would locate path outside dir: %s", alt_name)
- raise ObjectInvalid("The requested object is invalid")
- # alt_name can contain parent directory references, but S3 will not
- # follow them, so if they are valid we normalize them out
- alt_name = os.path.normpath(alt_name)
- rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
- if extra_dir is not None:
- if extra_dir_at_root:
- rel_path = os.path.join(extra_dir, rel_path)
- else:
- rel_path = os.path.join(rel_path, extra_dir)
-
- # for JOB_WORK directory
- if obj_dir:
- rel_path = os.path.join(rel_path, str(self._get_object_id(obj)))
- if base_dir:
- base = self.extra_dirs.get(base_dir)
- return os.path.join(str(base), rel_path)
-
- if not dir_only:
- rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
- return rel_path
-
- def _get_cache_path(self, rel_path):
- return os.path.abspath(os.path.join(self.staging_path, rel_path))
+ self._ensure_staging_path_writable()
+ self._start_cache_monitor_if_needed()
def _pull_into_cache(self, rel_path, auth_token):
log.debug("rucio _pull_into_cache: %s", rel_path)
@@ -414,25 +357,6 @@ def _exists(self, obj, **kwargs):
def parse_xml(cls, config_xml):
return parse_config_xml(config_xml)
- def file_ready(self, obj, **kwargs):
- log.debug("rucio file_ready")
- """
- A helper method that checks if a file corresponding to a dataset is
- ready and available to be used. Return ``True`` if so, ``False`` otherwise.
- """
- rel_path = self._construct_path(obj, **kwargs)
- # Make sure the size in cache is available in its entirety
- if self._in_cache(rel_path):
- if os.path.getsize(self._get_cache_path(rel_path)) == self.rucio_broker.get_size(rel_path):
- return True
- log.debug(
- "Waiting for dataset %s to transfer from OS: %s/%s",
- rel_path,
- os.path.getsize(self._get_cache_path(rel_path)),
- self.rucio_broker.get_size(rel_path),
- )
- return False
-
def _create(self, obj, **kwargs):
if not self._exists(obj, **kwargs):
# Pull out locally used fields
@@ -463,13 +387,6 @@ def _create(self, obj, **kwargs):
log.debug("rucio _create: %s", rel_path)
return self
- def _empty(self, obj, **kwargs):
- log.debug("rucio _empty")
- if self._exists(obj, **kwargs):
- return bool(self._size(obj, **kwargs) > 0)
- else:
- raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}")
-
def _size(self, obj, **kwargs):
rel_path = self._construct_path(obj, **kwargs)
log.debug("rucio _size: %s", rel_path)
@@ -482,10 +399,13 @@ def _size(self, obj, **kwargs):
if size != 0:
return size
if self._exists(obj, **kwargs):
- return self.rucio_broker.get_size(rel_path)
+ return self._get_remote_size(rel_path)
log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
return 0
+ def _get_remote_size(self, rel_path):
+ return self.rucio_broker.get_size(rel_path)
+
def _delete(self, obj, entire_dir=False, **kwargs):
rel_path = self._construct_path(obj, **kwargs)
extra_dir = kwargs.get("extra_dir", None)
@@ -515,20 +435,6 @@ def _delete(self, obj, entire_dir=False, **kwargs):
log.exception("%s delete error", self._get_filename(obj, **kwargs))
return False
- def _get_data(self, obj, start=0, count=-1, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- log.debug("rucio _get_data: %s", rel_path)
- auth_token = self._get_token(**kwargs)
- # Check cache first and get file if not there
- if not self._in_cache(rel_path) or os.path.getsize(self._get_cache_path(rel_path)) == 0:
- self._pull_into_cache(rel_path, auth_token)
- # Read the file content from cache
- data_file = open(self._get_cache_path(rel_path))
- data_file.seek(start)
- content = data_file.read(count)
- data_file.close()
- return content
-
def _get_token(self, **kwargs):
auth_token = kwargs.get("auth_token", None)
if auth_token:
@@ -649,13 +555,5 @@ def __build_kwargs(self, obj, **kwargs):
kwargs["object_id"] = obj.id
return kwargs
- @property
- def cache_target(self) -> CacheTarget:
- return CacheTarget(
- self.staging_path,
- self.cache_size,
- 0.9,
- )
-
def shutdown(self):
- self.cache_monitor and self.cache_monitor.shutdown()
+ self._shutdown_cache_monitor()
diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py
index 1caf355aec68..8c040e7523b8 100644
--- a/lib/galaxy/objectstore/s3.py
+++ b/lib/galaxy/objectstore/s3.py
@@ -3,13 +3,9 @@
"""
import logging
-import multiprocessing
import os
-import shutil
-import subprocess
import time
from datetime import datetime
-from typing import Optional
try:
# Imports are done this way to allow objectstore code to be used outside of Galaxy.
@@ -20,23 +16,11 @@
except ImportError:
boto = None # type: ignore[assignment]
-from galaxy.exceptions import (
- ObjectInvalid,
- ObjectNotFound,
-)
-from galaxy.util import (
- directory_hash_id,
- string_as_bool,
- umask_fix_perms,
- unlink,
- which,
-)
-from galaxy.util.path import safe_relpath
-from . import ConcreteObjectStore
+from galaxy.util import string_as_bool
+from ._caching_base import CachingConcreteObjectStore
+from ._util import UsesAxel
from .caching import (
- CacheTarget,
enable_cache_monitor,
- InProcessCacheMonitor,
parse_caching_config_dict_from_xml,
)
from .s3_multipart_upload import multipart_upload
@@ -119,7 +103,7 @@ def parse_config_xml(config_xml):
},
"cache": cache_dict,
"extra_dirs": extra_dirs,
- "private": ConcreteObjectStore.parse_private_from_config_xml(config_xml),
+ "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml),
}
except Exception:
# Toss it back up after logging, we can't continue loading at this point.
@@ -154,14 +138,13 @@ def _config_to_dict(self):
}
-class S3ObjectStore(ConcreteObjectStore, CloudConfigMixin):
+class S3ObjectStore(CachingConcreteObjectStore, CloudConfigMixin, UsesAxel):
"""
Object store that stores objects as items in an AWS S3 bucket. A local
cache exists that is used as an intermediate location for files between
Galaxy and S3.
"""
- cache_monitor: Optional[InProcessCacheMonitor] = None
store_type = "aws_s3"
def __init__(self, config, config_dict):
@@ -215,18 +198,11 @@ def _initialize(self):
"conn_path": self.conn_path,
}
+ self._ensure_staging_path_writable()
self._configure_connection()
self._bucket = self._get_bucket(self.bucket)
- self.start_cache_monitor()
- # Test if 'axel' is available for parallel download and pull the key into cache
- if which("axel"):
- self.use_axel = True
- else:
- self.use_axel = False
-
- def start_cache_monitor(self):
- if self.enable_cache_monitor:
- self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval)
+ self._start_cache_monitor_if_needed()
+ self._init_axel()
def _configure_connection(self):
log.debug("Configuring S3 Connection")
@@ -261,14 +237,6 @@ def to_dict(self):
as_dict.update(self._config_to_dict())
return as_dict
- @property
- def cache_target(self) -> CacheTarget:
- return CacheTarget(
- self.staging_path,
- self.cache_size,
- 0.9,
- )
-
def _get_bucket(self, bucket_name):
"""Sometimes a handle to a bucket is not established right away so try
it a few times. Raise error is connection is not established."""
@@ -288,73 +256,10 @@ def _get_bucket(self, bucket_name):
# raise error
raise S3ResponseError
- def _fix_permissions(self, rel_path):
- """Set permissions on rel_path"""
- for basedir, _, files in os.walk(rel_path):
- umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
- for filename in files:
- path = os.path.join(basedir, filename)
- # Ignore symlinks
- if os.path.islink(path):
- continue
- umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)
-
- def _construct_path(
- self,
- obj,
- base_dir=None,
- dir_only=None,
- extra_dir=None,
- extra_dir_at_root=False,
- alt_name=None,
- obj_dir=False,
- in_cache=False,
- **kwargs,
- ):
- # extra_dir should never be constructed from provided data but just
- # make sure there are no shenannigans afoot
- if extra_dir and extra_dir != os.path.normpath(extra_dir):
- log.warning("extra_dir is not normalized: %s", extra_dir)
- raise ObjectInvalid("The requested object is invalid")
- # ensure that any parent directory references in alt_name would not
- # result in a path not contained in the directory path constructed here
- if alt_name:
- if not safe_relpath(alt_name):
- log.warning("alt_name would locate path outside dir: %s", alt_name)
- raise ObjectInvalid("The requested object is invalid")
- # alt_name can contain parent directory references, but S3 will not
- # follow them, so if they are valid we normalize them out
- alt_name = os.path.normpath(alt_name)
- rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
- if extra_dir is not None:
- if extra_dir_at_root:
- rel_path = os.path.join(extra_dir, rel_path)
- else:
- rel_path = os.path.join(rel_path, extra_dir)
-
- # for JOB_WORK directory
- if obj_dir:
- rel_path = os.path.join(rel_path, str(self._get_object_id(obj)))
- if base_dir:
- base = self.extra_dirs.get(base_dir)
- return os.path.join(base, rel_path)
-
- # S3 folders are marked by having trailing '/' so add it now
- rel_path = f"{rel_path}/"
-
- if not dir_only:
- rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
- if in_cache:
- return self._get_cache_path(rel_path)
- return rel_path
-
- def _get_cache_path(self, rel_path):
- return os.path.abspath(os.path.join(self.staging_path, rel_path))
-
def _get_transfer_progress(self):
return self.transfer_progress
- def _get_size_in_s3(self, rel_path):
+ def _get_remote_size(self, rel_path):
try:
key = self._bucket.get_key(rel_path)
return key.size
@@ -362,7 +267,7 @@ def _get_size_in_s3(self, rel_path):
log.exception("Could not get size of key '%s' from S3", rel_path)
return -1
- def _key_exists(self, rel_path):
+ def _exists_remotely(self, rel_path):
exists = False
try:
# A hackish way of testing if the rel_path is a folder vs a file
@@ -381,82 +286,35 @@ def _key_exists(self, rel_path):
return False
return exists
- def _in_cache(self, rel_path):
- """Check if the given dataset is in the local cache and return True if so."""
- # log.debug("------ Checking cache for rel_path %s" % rel_path)
- cache_path = self._get_cache_path(rel_path)
- return os.path.exists(cache_path)
- # TODO: Part of checking if a file is in cache should be to ensure the
- # size of the cached file matches that on S3. Once the upload tool explicitly
- # creates, this check sould be implemented- in the mean time, it's not
- # looking likely to be implementable reliably.
- # if os.path.exists(cache_path):
- # # print("***1 %s exists" % cache_path)
- # if self._key_exists(rel_path):
- # # print("***2 %s exists in S3" % rel_path)
- # # Make sure the size in cache is available in its entirety
- # # print("File '%s' cache size: %s, S3 size: %s" % (cache_path, os.path.getsize(cache_path), self._get_size_in_s3(rel_path)))
- # if os.path.getsize(cache_path) == self._get_size_in_s3(rel_path):
- # # print("***2.1 %s exists in S3 and the size is the same as in cache (in_cache=True)" % rel_path)
- # exists = True
- # else:
- # # print("***2.2 %s exists but differs in size from cache (in_cache=False)" % cache_path)
- # exists = False
- # else:
- # # Although not perfect decision making, this most likely means
- # # that the file is currently being uploaded
- # # print("***3 %s found in cache but not in S3 (in_cache=True)" % cache_path)
- # exists = True
- # else:
- # return False
-
- def _pull_into_cache(self, rel_path):
- # Ensure the cache directory structure exists (e.g., dataset_#_files/)
- rel_path_dir = os.path.dirname(rel_path)
- if not os.path.exists(self._get_cache_path(rel_path_dir)):
- os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True)
- # Now pull in the file
- file_ok = self._download(rel_path)
- self._fix_permissions(self._get_cache_path(rel_path_dir))
- return file_ok
-
def _transfer_cb(self, complete, total):
self.transfer_progress += 10
def _download(self, rel_path):
+ local_destination = self._get_cache_path(rel_path)
try:
- log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
+ log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination)
key = self._bucket.get_key(rel_path)
if key is None:
message = f"Attempting to download an invalid key for path {rel_path}."
log.critical(message)
raise Exception(message)
- # Test if cache is large enough to hold the new file
- if not self.cache_target.fits_in_cache(key.size):
- log.critical(
- "File %s is larger (%s) than the configured cache allows (%s). Cannot download.",
- rel_path,
- key.size,
- self.cache_target.log_description,
- )
+ remote_size = key.size
+ if not self._caching_allowed(rel_path, remote_size):
return False
if self.use_axel:
- log.debug("Parallel pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
- ncores = multiprocessing.cpu_count()
+ log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination)
url = key.generate_url(7200)
- ret_code = subprocess.call(["axel", "-a", "-n", str(ncores), url])
- if ret_code == 0:
- return True
+ return self._axel_download(url, local_destination)
else:
- log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
+ log.debug("Pulled key '%s' into cache to %s", rel_path, local_destination)
self.transfer_progress = 0 # Reset transfer progress counter
- key.get_contents_to_filename(self._get_cache_path(rel_path), cb=self._transfer_cb, num_cb=10)
+ key.get_contents_to_filename(local_destination, cb=self._transfer_cb, num_cb=10)
return True
except S3ResponseError:
log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self._bucket.name)
return False
- def _push_to_os(self, rel_path, source_file=None, from_string=None):
+ def _push_to_storage(self, rel_path, source_file=None, from_string=None):
"""
Push the file pointed to by ``rel_path`` to the object store naming the key
``rel_path``. If ``source_file`` is provided, push that file instead while
@@ -512,225 +370,29 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None):
raise
return False
- def file_ready(self, obj, **kwargs):
- """
- A helper method that checks if a file corresponding to a dataset is
- ready and available to be used. Return ``True`` if so, ``False`` otherwise.
- """
- rel_path = self._construct_path(obj, **kwargs)
- # Make sure the size in cache is available in its entirety
- if self._in_cache(rel_path):
- if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_s3(rel_path):
- return True
- log.debug(
- "Waiting for dataset %s to transfer from OS: %s/%s",
- rel_path,
- os.path.getsize(self._get_cache_path(rel_path)),
- self._get_size_in_s3(rel_path),
- )
- return False
-
- def _exists(self, obj, **kwargs):
- in_cache = in_s3 = False
- rel_path = self._construct_path(obj, **kwargs)
- dir_only = kwargs.get("dir_only", False)
- base_dir = kwargs.get("base_dir", None)
-
- # check job work directory stuff early to skip API hits.
- if dir_only and base_dir:
- if not os.path.exists(rel_path):
- os.makedirs(rel_path, exist_ok=True)
- return True
-
- # Check cache
- if self._in_cache(rel_path):
- in_cache = True
- # Check S3
- in_s3 = self._key_exists(rel_path)
- # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3))
- # dir_only does not get synced so shortcut the decision
- if dir_only:
- if in_cache or in_s3:
- return True
- else:
- return False
-
- # TODO: Sync should probably not be done here. Add this to an async upload stack?
- if in_cache and not in_s3:
- self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
- return True
- elif in_s3:
+ def _delete_remote_all(self, rel_path: str) -> bool:
+ try:
+ results = self._bucket.get_all_keys(prefix=rel_path)
+ for key in results:
+ log.debug("Deleting key %s", key.name)
+ key.delete()
return True
- else:
+ except S3ResponseError:
+ log.exception("Could not delete blob '%s' from S3", rel_path)
return False
- def _create(self, obj, **kwargs):
- if not self._exists(obj, **kwargs):
- # Pull out locally used fields
- extra_dir = kwargs.get("extra_dir", None)
- extra_dir_at_root = kwargs.get("extra_dir_at_root", False)
- dir_only = kwargs.get("dir_only", False)
- alt_name = kwargs.get("alt_name", None)
-
- # Construct hashed path
- rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))
-
- # Optionally append extra_dir
- if extra_dir is not None:
- if extra_dir_at_root:
- rel_path = os.path.join(extra_dir, rel_path)
- else:
- rel_path = os.path.join(rel_path, extra_dir)
-
- # Create given directory in cache
- cache_dir = os.path.join(self.staging_path, rel_path)
- if not os.path.exists(cache_dir):
- os.makedirs(cache_dir, exist_ok=True)
-
- # Although not really necessary to create S3 folders (because S3 has
- # flat namespace), do so for consistency with the regular file system
- # S3 folders are marked by having trailing '/' so add it now
- # s3_dir = '%s/' % rel_path
- # self._push_to_os(s3_dir, from_string='')
- # If instructed, create the dataset in cache & in S3
- if not dir_only:
- rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
- open(os.path.join(self.staging_path, rel_path), "w").close()
- self._push_to_os(rel_path, from_string="")
- return self
-
- def _empty(self, obj, **kwargs):
- if self._exists(obj, **kwargs):
- return bool(self._size(obj, **kwargs) == 0)
- else:
- raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}")
-
- def _size(self, obj, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- if self._in_cache(rel_path):
- try:
- return os.path.getsize(self._get_cache_path(rel_path))
- except OSError as ex:
- log.info("Could not get size of file '%s' in local cache, will try S3. Error: %s", rel_path, ex)
- elif self._exists(obj, **kwargs):
- return self._get_size_in_s3(rel_path)
- log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
- return 0
-
- def _delete(self, obj, entire_dir=False, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- extra_dir = kwargs.get("extra_dir", None)
- base_dir = kwargs.get("base_dir", None)
- dir_only = kwargs.get("dir_only", False)
- obj_dir = kwargs.get("obj_dir", False)
+ def _delete_existing_remote(self, rel_path: str) -> bool:
try:
- # Remove temparory data in JOB_WORK directory
- if base_dir and dir_only and obj_dir:
- shutil.rmtree(os.path.abspath(rel_path))
- return True
-
- # For the case of extra_files, because we don't have a reference to
- # individual files/keys we need to remove the entire directory structure
- # with all the files in it. This is easy for the local file system,
- # but requires iterating through each individual key in S3 and deleing it.
- if entire_dir and extra_dir:
- shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True)
- results = self._bucket.get_all_keys(prefix=rel_path)
- for key in results:
- log.debug("Deleting key %s", key.name)
- key.delete()
- return True
- else:
- # Delete from cache first
- unlink(self._get_cache_path(rel_path), ignore_errors=True)
- # Delete from S3 as well
- if self._key_exists(rel_path):
- key = Key(self._bucket, rel_path)
- log.debug("Deleting key %s", key.name)
- key.delete()
- return True
+ key = Key(self._bucket, rel_path)
+ log.debug("Deleting key %s", key.name)
+ key.delete()
+ return True
except S3ResponseError:
- log.exception("Could not delete key '%s' from S3", rel_path)
- except OSError:
- log.exception("%s delete error", self._get_filename(obj, **kwargs))
- return False
+ log.exception("Could not delete blob '%s' from S3", rel_path)
+ return False
- def _get_data(self, obj, start=0, count=-1, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- # Check cache first and get file if not there
- if not self._in_cache(rel_path) or os.path.getsize(self._get_cache_path(rel_path)) == 0:
- self._pull_into_cache(rel_path)
- # Read the file content from cache
- data_file = open(self._get_cache_path(rel_path))
- data_file.seek(start)
- content = data_file.read(count)
- data_file.close()
- return content
-
- def _get_filename(self, obj, **kwargs):
- base_dir = kwargs.get("base_dir", None)
- dir_only = kwargs.get("dir_only", False)
- obj_dir = kwargs.get("obj_dir", False)
- sync_cache = kwargs.get("sync_cache", True)
-
- rel_path = self._construct_path(obj, **kwargs)
-
- # for JOB_WORK directory
- if base_dir and dir_only and obj_dir:
- return os.path.abspath(rel_path)
-
- cache_path = self._get_cache_path(rel_path)
- if not sync_cache:
- return cache_path
- # S3 does not recognize directories as files so cannot check if those exist.
- # So, if checking dir only, ensure given dir exists in cache and return
- # the expected cache path.
- # dir_only = kwargs.get('dir_only', False)
- # if dir_only:
- # if not os.path.exists(cache_path):
- # os.makedirs(cache_path)
- # return cache_path
- # Check if the file exists in the cache first, always pull if file size in cache is zero
- if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0):
- return cache_path
- # Check if the file exists in persistent storage and, if it does, pull it into cache
- elif self._exists(obj, **kwargs):
- if dir_only:
- download_directory(self._bucket, rel_path, cache_path)
- return cache_path
- else:
- if self._pull_into_cache(rel_path):
- return cache_path
- # For the case of retrieving a directory only, return the expected path
- # even if it does not exist.
- # if dir_only:
- # return cache_path
- raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}")
- # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path
-
- def _update_from_file(self, obj, file_name=None, create=False, **kwargs):
- if create:
- self._create(obj, **kwargs)
- if self._exists(obj, **kwargs):
- rel_path = self._construct_path(obj, **kwargs)
- # Chose whether to use the dataset file itself or an alternate file
- if file_name:
- source_file = os.path.abspath(file_name)
- # Copy into cache
- cache_file = self._get_cache_path(rel_path)
- try:
- if source_file != cache_file and self.cache_updated_data:
- # FIXME? Should this be a `move`?
- shutil.copy2(source_file, cache_file)
- self._fix_permissions(cache_file)
- except OSError:
- log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
- else:
- source_file = self._get_cache_path(rel_path)
- # Update the file on S3
- self._push_to_os(rel_path, source_file)
- else:
- raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}")
+ def _download_directory_into_cache(self, rel_path, cache_path):
+ download_directory(self._bucket, rel_path, cache_path)
def _get_object_url(self, obj, **kwargs):
if self._exists(obj, **kwargs):
@@ -746,7 +408,7 @@ def _get_store_usage_percent(self, obj):
return 0.0
def shutdown(self):
- self.cache_monitor and self.cache_monitor.shutdown()
+ self._shutdown_cache_monitor()
class GenericS3ObjectStore(S3ObjectStore):
diff --git a/lib/galaxy/objectstore/s3_boto3.py b/lib/galaxy/objectstore/s3_boto3.py
new file mode 100644
index 000000000000..81dd41ac97a5
--- /dev/null
+++ b/lib/galaxy/objectstore/s3_boto3.py
@@ -0,0 +1,424 @@
+"""A more modern version of the S3 object store based on boto3 instead of boto.
+"""
+
+import logging
+import os
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ TYPE_CHECKING,
+)
+
+from typing_extensions import (
+ Literal,
+ NotRequired,
+ TypedDict,
+)
+
+if TYPE_CHECKING:
+    from mypy_boto3_s3.client import S3Client
+
+try:
+ # Imports are done this way to allow objectstore code to be used outside of Galaxy.
+ import boto3
+ from boto3.s3.transfer import TransferConfig
+ from botocore.client import ClientError
+except ImportError:
+ boto3 = None # type: ignore[assignment,unused-ignore]
+ TransferConfig = None # type: ignore[assignment,unused-ignore,misc]
+
+from galaxy.util import asbool
+from ._caching_base import CachingConcreteObjectStore
+from .caching import (
+ enable_cache_monitor,
+ parse_caching_config_dict_from_xml,
+)
+
+NO_BOTO_ERROR_MESSAGE = (
+ "S3/Swift object store configured, but no boto3 dependency available."
+ "Please install and properly configure boto or modify object store configuration."
+)
+
+log = logging.getLogger(__name__)
+# This object store generates a lot of logging by default; silencing the client
+# libraries wholesale is arguably an anti-pattern, so these stay commented out:
+# logging.getLogger("botocore").setLevel(logging.INFO)
+# logging.getLogger("s3transfer").setLevel(logging.INFO)
+
+
+def host_to_endpoint(mapping):
+ # convert older-style boto parameters to boto3 endpoint_url.
+ host = mapping["host"]
+ port = mapping.get("port", 6000)
+ is_secure = asbool(mapping.get("is_secure", "True"))
+ conn_path = mapping.get("conn_path", "/")
+ scheme = "https" if is_secure else "http"
+ return f"{scheme}://{host}:{port}{conn_path}"
+
+
+def parse_config_xml(config_xml):
+ try:
+ a_xml = config_xml.findall("auth")[0]
+ access_key = a_xml.get("access_key")
+ secret_key = a_xml.get("secret_key")
+
+ b_xml = config_xml.findall("bucket")[0]
+ bucket_name = b_xml.get("name")
+
+ cn_xml = config_xml.findall("connection")
+ if not cn_xml:
+ cn_xml = {}
+ else:
+ cn_xml = cn_xml[0]
+ endpoint_url = cn_xml.get("endpoint_url")
+
+ # for admin ease - allow older style host, port, is_secure, conn_path to be used.
+ if endpoint_url is None and cn_xml.get("host") is not None:
+ endpoint_url = host_to_endpoint(cn_xml)
+ region = cn_xml.get("region")
+ cache_dict = parse_caching_config_dict_from_xml(config_xml)
+
+ transfer_xml = config_xml.findall("transfer")
+ if not transfer_xml:
+ transfer_xml = {}
+ else:
+ transfer_xml = transfer_xml[0]
+ transfer_dict = {}
+ for prefix in ["", "upload_", "download_"]:
+ for key in [
+ "multipart_threshold",
+ "max_concurrency",
+ "multipart_chunksize",
+ "num_download_attempts",
+ "max_io_queue",
+ "io_chunksize",
+ "use_threads",
+ "max_bandwidth",
+ ]:
+ full_key = f"{prefix}{key}"
+ value = transfer_xml.get(full_key)
+                if value is not None:
+ transfer_dict[full_key] = value
+
+ tag, attrs = "extra_dir", ("type", "path")
+ extra_dirs = config_xml.findall(tag)
+ if not extra_dirs:
+ msg = f"No {tag} element in XML tree"
+ log.error(msg)
+ raise Exception(msg)
+ extra_dirs = [{k: e.get(k) for k in attrs} for e in extra_dirs]
+
+ config_dict = {
+ "auth": {
+ "access_key": access_key,
+ "secret_key": secret_key,
+ },
+ "bucket": {
+ "name": bucket_name,
+ },
+ "connection": {
+ "endpoint_url": endpoint_url,
+ "region": region,
+ },
+ "transfer": transfer_dict,
+ "cache": cache_dict,
+ "extra_dirs": extra_dirs,
+ "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml),
+ }
+ name = config_xml.attrib.get("name", None)
+ if name is not None:
+ config_dict["name"] = name
+ device = config_xml.attrib.get("device", None)
+ config_dict["device"] = device
+ return config_dict
+ except Exception:
+ # Toss it back up after logging, we can't continue loading at this point.
+ log.exception("Malformed ObjectStore Configuration XML -- unable to continue")
+ raise
+
+
+class S3ClientConstructorKwds(TypedDict):
+ service_name: Literal["s3"]
+ endpoint_url: NotRequired[str]
+ region_name: NotRequired[str]
+ aws_access_key_id: NotRequired[str]
+ aws_secret_access_key: NotRequired[str]
+
+
+class S3ObjectStore(CachingConcreteObjectStore):
+ """
+ Object store that stores objects as items in an AWS S3 bucket. A local
+ cache exists that is used as an intermediate location for files between
+ Galaxy and S3.
+ """
+
+ _client: "S3Client"
+ store_type = "boto3"
+ cloud = True
+
+ def __init__(self, config, config_dict):
+ super().__init__(config, config_dict)
+ self.cache_monitor = None
+
+ auth_dict = config_dict["auth"]
+ bucket_dict = config_dict["bucket"]
+ connection_dict = config_dict.get("connection", {})
+ cache_dict = config_dict.get("cache") or {}
+ transfer_dict = config_dict.get("transfer", {})
+ typed_transfer_dict = {}
+ for prefix in ["", "upload_", "download_"]:
+ options: Dict[str, Callable[[Any], Any]] = {
+ "multipart_threshold": int,
+ "max_concurrency": int,
+ "multipart_chunksize": int,
+ "num_download_attempts": int,
+ "max_io_queue": int,
+ "io_chunksize": int,
+ "use_threads": asbool,
+ "max_bandwidth": int,
+ }
+ for key, key_type in options.items():
+ full_key = f"{prefix}{key}"
+ transfer_value = transfer_dict.get(full_key)
+ if transfer_value is not None:
+ typed_transfer_dict[full_key] = key_type(transfer_value)
+ self.transfer_dict = typed_transfer_dict
+
+ self.enable_cache_monitor, self.cache_monitor_interval = enable_cache_monitor(config, config_dict)
+
+ self.access_key = auth_dict.get("access_key")
+ self.secret_key = auth_dict.get("secret_key")
+
+ self.bucket = bucket_dict.get("name")
+
+ self.endpoint_url = connection_dict.get("endpoint_url")
+ if self.endpoint_url is None and "host" in connection_dict:
+ self.endpoint_url = host_to_endpoint(connection_dict)
+
+ self.region = connection_dict.get("region")
+
+ self.cache_size = cache_dict.get("size") or self.config.object_store_cache_size
+ self.staging_path = cache_dict.get("path") or self.config.object_store_cache_path
+ self.cache_updated_data = cache_dict.get("cache_updated_data", True)
+
+ extra_dirs = {e["type"]: e["path"] for e in config_dict.get("extra_dirs", [])}
+ self.extra_dirs.update(extra_dirs)
+
+ self._initialize()
+
+ def _initialize(self):
+ if boto3 is None:
+ raise Exception(NO_BOTO_ERROR_MESSAGE)
+
+ self._ensure_staging_path_writable()
+ self._configure_connection()
+ self._start_cache_monitor_if_needed()
+
+ def _configure_connection(self):
+ log.debug("Configuring S3 Connection")
+ self._init_client()
+ if not self._bucket_exists:
+ self._create_bucket()
+
+        # get_object_url only works against AWS when the client knows the bucket's
+        # region, so if no region was configured, look up the bucket location and
+        # re-initialize the client with it. Skip this entirely for non-AWS services,
+        # which are identified by having an endpoint_url set.
+ if not self.endpoint_url and not self.region:
+ response = self._client.get_bucket_location(
+ Bucket=self.bucket,
+ )
+ if "LocationConstraint" in response:
+ region = response["LocationConstraint"]
+ self.region = region
+ self._init_client()
+
+ def _init_client(self):
+        # Set _client from the current configuration; if access_key is empty,
+        # boto3 falls back to its default credential chain.
+ kwds: S3ClientConstructorKwds = {
+ "service_name": "s3",
+ }
+ if self.endpoint_url:
+ kwds["endpoint_url"] = self.endpoint_url
+ if self.region:
+ kwds["region_name"] = self.region
+ if self.access_key:
+ kwds["aws_access_key_id"] = self.access_key
+ kwds["aws_secret_access_key"] = self.secret_key
+ self._client = boto3.client(**kwds)
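+        # Illustrative sketch (hypothetical endpoint): a non-AWS service configured with
+        # endpoint_url "https://minio.example.org:9000/" and explicit keys resolves to
+        #   boto3.client(service_name="s3",
+        #                endpoint_url="https://minio.example.org:9000/",
+        #                aws_access_key_id="...", aws_secret_access_key="...")
+        # whereas on AWS with no keys configured only service_name (and possibly
+        # region_name) is passed.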
+
+ @property
+ def _bucket_exists(self) -> bool:
+ try:
+ self._client.head_bucket(Bucket=self.bucket)
+ return True
+ except ClientError as err:
+ if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
+ return False
+ raise
+
+ def _create_bucket(self):
+ kwds = {}
+ if self.region:
+ kwds["CreateBucketConfiguration"] = dict(LocationConstraint=self.region)
+ self._client.create_bucket(Bucket=self.bucket, **kwds)
+
+ @classmethod
+ def parse_xml(clazz, config_xml):
+ return parse_config_xml(config_xml)
+
+ def _config_to_dict(self):
+ return {
+ "auth": {
+ "access_key": self.access_key,
+ "secret_key": self.secret_key,
+ },
+ "bucket": {
+ "name": self.bucket,
+ },
+ "connection": {
+ "endpoint_url": self.endpoint_url,
+ "region": self.region,
+ },
+ "transfer": self.transfer_dict,
+ "cache": {
+ "size": self.cache_size,
+ "path": self.staging_path,
+ "cache_updated_data": self.cache_updated_data,
+ },
+ }
+
+ def to_dict(self):
+ as_dict = super().to_dict()
+ as_dict.update(self._config_to_dict())
+ return as_dict
+
+ def _get_remote_size(self, rel_path) -> int:
+ response = self._client.head_object(Bucket=self.bucket, Key=rel_path)
+ return response["ContentLength"]
+
+ def _exists_remotely(self, rel_path: str) -> bool:
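+        # A trailing "/" marks a pseudo-directory: existence then means at least one
+        # key is stored under that prefix; otherwise a plain head_object on the key
+        # is enough.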
+ try:
+ is_dir = rel_path[-1] == "/"
+ if is_dir:
+ for _ in self._keys(rel_path):
+ return True
+
+ return False
+ else:
+ self._client.head_object(Bucket=self.bucket, Key=rel_path)
+ return True
+ except ClientError as e:
+ if e.response["Error"]["Code"] == "404":
+ return False
+ raise
+
+ def _download(self, rel_path: str) -> bool:
+ local_destination = self._get_cache_path(rel_path)
+ try:
+ log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination)
+ if not self._caching_allowed(rel_path):
+ return False
+ config = self._transfer_config("download")
+ self._client.download_file(self.bucket, rel_path, local_destination, Config=config)
+ return True
+ except ClientError:
+ log.exception("Failed to download file from S3")
+ return False
+
+ def _push_string_to_path(self, rel_path: str, from_string: str) -> bool:
+ try:
+ self._client.put_object(Body=from_string.encode("utf-8"), Bucket=self.bucket, Key=rel_path)
+ return True
+ except ClientError:
+ log.exception("Trouble pushing to S3 '%s' from string", rel_path)
+ return False
+
+ def _push_file_to_path(self, rel_path: str, source_file: str) -> bool:
+ try:
+ config = self._transfer_config("upload")
+ self._client.upload_file(source_file, self.bucket, rel_path, Config=config)
+ return True
+ except ClientError:
+ log.exception("Trouble pushing to S3 '%s' from file '%s'", rel_path, source_file)
+ return False
+
+ def _delete_remote_all(self, rel_path: str) -> bool:
+ try:
+ for key in self._keys(rel_path):
+ self._client.delete_object(Bucket=self.bucket, Key=key)
+ return True
+ except ClientError:
+ log.exception("Could not delete blob '%s' from S3", rel_path)
+ return False
+
+ def _delete_existing_remote(self, rel_path: str) -> bool:
+ try:
+ self._client.delete_object(Bucket=self.bucket, Key=rel_path)
+ return True
+ except ClientError:
+ log.exception("Could not delete blob '%s' from S3", rel_path)
+ return False
+
+ # https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3
+ def _keys(self, prefix="/", delimiter="/", start_after=""):
+ s3_paginator = self._client.get_paginator("list_objects_v2")
+ prefix = prefix.lstrip(delimiter)
+ start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after
+ for page in s3_paginator.paginate(Bucket=self.bucket, Prefix=prefix, StartAfter=start_after):
+ for content in page.get("Contents", ()):
+ yield content["Key"]
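+        # Illustrative usage (hypothetical prefix): iterating self._keys("a/b/dataset_1_files/")
+        # yields every key stored under that pseudo-directory, fetched page by page
+        # through the list_objects_v2 paginator.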
+
+ def _download_directory_into_cache(self, rel_path, cache_path):
+ for key in self._keys(rel_path):
+ local_file_path = os.path.join(cache_path, os.path.relpath(key, rel_path))
+
+ # Create directories if they don't exist
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+ # Download the file
+ self._client.download_file(self.bucket, key, local_file_path)
+
+ def _get_object_url(self, obj, **kwargs):
+ try:
+ if self._exists(obj, **kwargs):
+ rel_path = self._construct_path(obj, **kwargs)
+ url = self._client.generate_presigned_url(
+ ClientMethod="get_object",
+ Params={
+ "Bucket": self.bucket,
+ "Key": rel_path,
+ },
+ ExpiresIn=3600,
+ HttpMethod="GET",
+ )
+ return url
+ except ClientError:
+ log.exception("Failed to generate URL for dataset.")
+ return None
+
+ def _get_store_usage_percent(self, obj):
+ return 0.0
+
+ def _transfer_config(self, prefix: Literal["upload", "download"]) -> "TransferConfig":
+ config = {}
+ for key in [
+ "multipart_threshold",
+ "max_concurrency",
+ "multipart_chunksize",
+ "num_download_attempts",
+ "max_io_queue",
+ "io_chunksize",
+ "use_threads",
+ "max_bandwidth",
+ ]:
+ specific_key = f"{prefix}_{key}"
+ if specific_key in self.transfer_dict:
+ config[key] = self.transfer_dict[specific_key]
+ elif key in self.transfer_dict:
+ config[key] = self.transfer_dict[key]
+ return TransferConfig(**config)
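+        # Worked example (illustrative values): a transfer_dict of
+        #   {"multipart_threshold": 8388608, "upload_max_concurrency": 4, "download_max_concurrency": 2}
+        # yields TransferConfig(multipart_threshold=8388608, max_concurrency=4) for
+        # "upload" and TransferConfig(multipart_threshold=8388608, max_concurrency=2)
+        # for "download"; unset options keep boto3's TransferConfig defaults.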
+
+ def shutdown(self):
+ self._shutdown_cache_monitor()
diff --git a/lib/galaxy/objectstore/unittest_utils/__init__.py b/lib/galaxy/objectstore/unittest_utils/__init__.py
index 8807159b2437..158400b7aea9 100644
--- a/lib/galaxy/objectstore/unittest_utils/__init__.py
+++ b/lib/galaxy/objectstore/unittest_utils/__init__.py
@@ -1,6 +1,7 @@
"""Utilities for configuring and using objectstores in unit tests."""
import os
+import random
from io import StringIO
from shutil import rmtree
from string import Template
@@ -32,10 +33,22 @@
class Config:
- def __init__(self, config_str=DISK_TEST_CONFIG, clazz=None, store_by="id", template_vars=None):
+ def __init__(
+ self,
+ config_str=DISK_TEST_CONFIG,
+ clazz=None,
+ store_by="id",
+ template_vars=None,
+ inject_galaxy_test_env=False,
+ ):
self.temp_directory = mkdtemp()
- template_vars = template_vars or {}
+ template_vars = {}
template_vars["temp_directory"] = self.temp_directory
+ if inject_galaxy_test_env:
+ template_vars["test_random_int"] = random.randint(100000, 999999)
+ for key, value in os.environ.items():
+ if key.startswith("GALAXY_TEST_"):
+ template_vars[key] = value
self.template_vars = template_vars
if config_str.startswith("<"):
config_file = "store.xml"
diff --git a/packages/objectstore/test-requirements.txt b/packages/objectstore/test-requirements.txt
index e079f8a6038d..8077db870b39 100644
--- a/packages/objectstore/test-requirements.txt
+++ b/packages/objectstore/test-requirements.txt
@@ -1 +1,3 @@
pytest
+boto3
+azure-storage-blob
diff --git a/pyproject.toml b/pyproject.toml
index 4799ac3ebc4c..95e88cf337bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -130,6 +130,7 @@ s3fs = "*"
ase = ">=3.18.1"
axe-selenium-python = "*"
black = "*"
+boto3 = "*"
codespell = "*"
cwltest = "*"
darker = "*"
@@ -183,6 +184,7 @@ types-python-dateutil = "*"
types-PyYAML = "*"
types-requests = "*"
types-six = "*"
+"boto3-stubs[s3]" = "*"
[tool.ruff]
target-version = "py38"
diff --git a/test/integration/objectstore/test_objectstore_datatype_upload.py b/test/integration/objectstore/test_objectstore_datatype_upload.py
index 3a17174bd66c..a383e43edaa1 100644
--- a/test/integration/objectstore/test_objectstore_datatype_upload.py
+++ b/test/integration/objectstore/test_objectstore_datatype_upload.py
@@ -234,7 +234,7 @@ def test_upload_datatype_irods_idle_connections(
# Verify the connection pool has 0 active and 1 idle connections
assert len(connection_pool.active) == 0
- assert len(connection_pool.idle) == 1
+ assert len(connection_pool.idle) in [1, 2]
# Wait for the idle connection to turn stale
time.sleep(REFRESH_TIME)
diff --git a/test/unit/objectstore/test_objectstore.py b/test/unit/objectstore/test_objectstore.py
index 44564acbc2be..6fe31f79bb44 100644
--- a/test/unit/objectstore/test_objectstore.py
+++ b/test/unit/objectstore/test_objectstore.py
@@ -1,5 +1,7 @@
import os
+import shutil
import time
+from functools import wraps
from tempfile import (
mkdtemp,
mkstemp,
@@ -11,6 +13,7 @@
from requests import get
from galaxy.exceptions import ObjectInvalid
+from galaxy.objectstore import persist_extra_files_for_dataset
from galaxy.objectstore.azure_blob import AzureBlobObjectStore
from galaxy.objectstore.caching import (
CacheTarget,
@@ -19,8 +22,10 @@
reset_cache,
)
from galaxy.objectstore.cloud import Cloud
+from galaxy.objectstore.examples import get_example
from galaxy.objectstore.pithos import PithosObjectStore
from galaxy.objectstore.s3 import S3ObjectStore
+from galaxy.objectstore.s3_boto3 import S3ObjectStore as Boto3ObjectStore
from galaxy.objectstore.unittest_utils import (
Config as TestConfig,
DISK_TEST_CONFIG,
@@ -46,6 +51,11 @@ def _initialize(self):
pass
+class UninitializedBoto3ObjectStore(Boto3ObjectStore):
+ def _initialize(self):
+ pass
+
+
class UninitializedAzureBlobObjectStore(AzureBlobObjectStore):
def _initialize(self):
pass
@@ -56,6 +66,20 @@ def _initialize(self):
pass
+def patch_object_stores_to_skip_initialize(f):
+
+ @wraps(f)
+ @patch("galaxy.objectstore.s3.S3ObjectStore", UninitializedS3ObjectStore)
+ @patch("galaxy.objectstore.s3_boto3.S3ObjectStore", UninitializedBoto3ObjectStore)
+ @patch("galaxy.objectstore.pithos.PithosObjectStore", UninitializedPithosObjectStore)
+ @patch("galaxy.objectstore.cloud.Cloud", UninitializedCloudObjectStore)
+ @patch("galaxy.objectstore.azure_blob.AzureBlobObjectStore", UninitializedAzureBlobObjectStore)
+ def wrapper(*args, **kwd):
+ f(*args, **kwd)
+
+ return wrapper
+
+
def test_unlink_path():
with pytest.raises(FileNotFoundError):
unlink(uuid4().hex)
@@ -236,61 +260,8 @@ def test_disk_store_alt_name_abspath():
pass
-HIERARCHICAL_TEST_CONFIG = """
-
-
-
-
- This is our new storage cluster, check out the storage
- on our institute's system page for [Fancy New Storage](http://computecenter.example.com/systems/fancystorage).
-
-
-
-
-
-
-
- This is our older legacy storage cluster, check out the storage
- on our institute's system page for [Legacy Storage](http://computecenter.example.com/systems/legacystorage).
-
-
-
-
-
-
-
-"""
-
-HIERARCHICAL_TEST_CONFIG_YAML = """
-type: hierarchical
-backends:
- - id: files1
- name: Newer Cool Storage
- description: |
- This is our new storage cluster, check out the storage
- on our institute's system page for [Fancy New Storage](http://computecenter.example.com/systems/fancystorage).
- type: disk
- weight: 1
- files_dir: "${temp_directory}/files1"
- extra_dirs:
- - type: temp
- path: "${temp_directory}/tmp1"
- - type: job_work
- path: "${temp_directory}/job_working_directory1"
- - id: files2
- name: Older Legacy Storage
- description: |
- This is our older legacy storage cluster, check out the storage
- on our institute's system page for [Legacy Storage](http://computecenter.example.com/systems/legacystorage).
- type: disk
- weight: 1
- files_dir: "${temp_directory}/files2"
- extra_dirs:
- - type: temp
- path: "${temp_directory}/tmp2"
- - type: job_work
- path: "${temp_directory}/job_working_directory2"
-"""
+HIERARCHICAL_TEST_CONFIG = get_example("hierarchical_simple.xml")
+HIERARCHICAL_TEST_CONFIG_YAML = get_example("hierarchical_simple.yml")
def test_hierarchical_store():
@@ -392,7 +363,6 @@ def test_mixed_private():
# Distributed object store can combine private and non-private concrete objectstores
with TestConfig(MIXED_STORE_BY_DISTRIBUTED_TEST_CONFIG) as (directory, object_store):
ids = object_store.object_store_ids()
- print(ids)
assert len(ids) == 2
ids = object_store.object_store_ids(private=True)
@@ -424,40 +394,8 @@ def test_empty_cache_targets_for_disk_nested_stores():
assert len(object_store.cache_targets()) == 0
-BADGES_TEST_1_CONFIG_XML = """
-
-
-
-
-
-
- Fast interconnects.
-
-
- Storage is backed up to tape nightly.
-
-
-"""
-
-
-BADGES_TEST_1_CONFIG_YAML = """
-type: disk
-files_dir: "${temp_directory}/files1"
-store_by: uuid
-extra_dirs:
- - type: temp
- path: "${temp_directory}/tmp1"
- - type: job_work
- path: "${temp_directory}/job_working_directory1"
-badges:
- - type: short_term
- - type: faster
- message: Fast interconnects.
- - type: less_stable
- - type: more_secure
- - type: backed_up
- message: Storage is backed up to tape nightly.
-"""
+BADGES_TEST_1_CONFIG_XML = get_example("disk_badges.xml")
+BADGES_TEST_1_CONFIG_YAML = get_example("disk_badges.yml")
def test_badges_parsing():
@@ -524,54 +462,8 @@ def test_badges_parsing_conflicts():
assert exception_raised
-DISTRIBUTED_TEST_CONFIG = """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-"""
-
-
-DISTRIBUTED_TEST_CONFIG_YAML = """
-type: distributed
-backends:
- - id: files1
- quota:
- source: 1files
- type: disk
- weight: 2
- device: primary_disk
- files_dir: "${temp_directory}/files1"
- extra_dirs:
- - type: temp
- path: "${temp_directory}/tmp1"
- - type: job_work
- path: "${temp_directory}/job_working_directory1"
- - id: files2
- quota:
- source: 2files
- type: disk
- weight: 1
- device: primary_disk
- files_dir: "${temp_directory}/files2"
- extra_dirs:
- - type: temp
- path: "${temp_directory}/tmp2"
- - type: job_work
- path: "${temp_directory}/job_working_directory2"
-"""
+DISTRIBUTED_TEST_CONFIG = get_example("distributed_disk.xml")
+DISTRIBUTED_TEST_CONFIG_YAML = get_example("distributed_disk.yml")
def test_distributed_store():
@@ -605,7 +497,6 @@ def test_distributed_store():
device_source_map = object_store.get_device_source_map()
assert device_source_map
- print(device_source_map.backends)
assert device_source_map.get_device_id("files1") == "primary_disk"
assert device_source_map.get_device_id("files2") == "primary_disk"
@@ -616,48 +507,10 @@ def test_distributed_store_empty_cache_targets():
assert len(object_store.cache_targets()) == 0
-DISTRIBUTED_TEST_S3_CONFIG_YAML = """
-type: distributed
-backends:
- - id: files1
- weight: 1
- type: s3
- auth:
- access_key: access_moo
- secret_key: secret_cow
-
- bucket:
- name: unique_bucket_name_all_lowercase
- use_reduced_redundancy: false
-
- extra_dirs:
- - type: job_work
- path: ${temp_directory}/job_working_directory_s3
- - type: temp
- path: ${temp_directory}/tmp_s3
- - id: files2
- weight: 1
- type: s3
- auth:
- access_key: access_moo
- secret_key: secret_cow
-
- bucket:
- name: unique_bucket_name_all_lowercase_2
- use_reduced_redundancy: false
-
- extra_dirs:
- - type: job_work
- path: ${temp_directory}/job_working_directory_s3_2
- - type: temp
- path: ${temp_directory}/tmp_s3_2
-"""
-
-
-@patch("galaxy.objectstore.s3.S3ObjectStore", UninitializedS3ObjectStore)
+@patch_object_stores_to_skip_initialize
def test_distributed_store_with_cache_targets():
- for config_str in [DISTRIBUTED_TEST_S3_CONFIG_YAML]:
- with TestConfig(config_str) as (directory, object_store):
+ for config_str in [get_example("distributed_s3.yml")]:
+ with TestConfig(config_str) as (_, object_store):
assert len(object_store.cache_targets()) == 2
@@ -691,37 +544,14 @@ def test_hiercachical_backend_must_share_quota_source():
assert the_exception is not None
-PITHOS_TEST_CONFIG = """
-
-
-
-
-
-
-"""
-
-
-PITHOS_TEST_CONFIG_YAML = """
-type: pithos
-auth:
- url: http://example.org/
- token: extoken123
-
-container:
- name: foo
- project: cow
-
-extra_dirs:
- - type: temp
- path: database/tmp_pithos
- - type: job_work
- path: database/working_pithos
-"""
+PITHOS_TEST_CONFIG = get_example("pithos_simple.xml")
+PITHOS_TEST_CONFIG_YAML = get_example("pithos_simple.yml")
+@patch_object_stores_to_skip_initialize
def test_config_parse_pithos():
for config_str in [PITHOS_TEST_CONFIG, PITHOS_TEST_CONFIG_YAML]:
- with TestConfig(config_str, clazz=UninitializedPithosObjectStore) as (directory, object_store):
+ with TestConfig(config_str) as (directory, object_store):
configured_config_dict = object_store.config_dict
_assert_has_keys(configured_config_dict, ["auth", "container", "extra_dirs"])
@@ -755,42 +585,14 @@ def test_config_parse_pithos():
assert len(extra_dirs) == 2
-S3_TEST_CONFIG = """
-
-
-
-
-
-
-"""
-
-
-S3_TEST_CONFIG_YAML = """
-type: s3
-private: true
-auth:
- access_key: access_moo
- secret_key: secret_cow
-
-bucket:
- name: unique_bucket_name_all_lowercase
- use_reduced_redundancy: false
-
-cache:
- path: database/object_store_cache
- size: 1000
-
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_s3
-- type: temp
- path: database/tmp_s3
-"""
+S3_TEST_CONFIG = get_example("s3_simple.xml")
+S3_TEST_CONFIG_YAML = get_example("s3_simple.yml")
+@patch_object_stores_to_skip_initialize
def test_config_parse_s3():
for config_str in [S3_TEST_CONFIG, S3_TEST_CONFIG_YAML]:
- with TestConfig(config_str, clazz=UninitializedS3ObjectStore) as (directory, object_store):
+ with TestConfig(config_str) as (directory, object_store):
assert object_store.private
assert object_store.access_key == "access_moo"
assert object_store.secret_key == "secret_cow"
@@ -838,140 +640,134 @@ def test_config_parse_s3():
assert len(extra_dirs) == 2
-S3_DEFAULT_CACHE_TEST_CONFIG = """
-
-
-
-
-
-"""
-
-
-S3_DEFAULT_CACHE_TEST_CONFIG_YAML = """
-type: s3
-private: true
-auth:
- access_key: access_moo
- secret_key: secret_cow
-
-bucket:
- name: unique_bucket_name_all_lowercase
- use_reduced_redundancy: false
-
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_s3
-- type: temp
- path: database/tmp_s3
-"""
+S3_DEFAULT_CACHE_TEST_CONFIG = get_example("s3_global_cache.xml")
+S3_DEFAULT_CACHE_TEST_CONFIG_YAML = get_example("s3_global_cache.yml")
+@patch_object_stores_to_skip_initialize
def test_config_parse_s3_with_default_cache():
for config_str in [S3_DEFAULT_CACHE_TEST_CONFIG, S3_DEFAULT_CACHE_TEST_CONFIG_YAML]:
- with TestConfig(config_str, clazz=UninitializedS3ObjectStore) as (directory, object_store):
+ with TestConfig(config_str) as (directory, object_store):
assert object_store.cache_size == -1
assert object_store.staging_path == directory.global_config.object_store_cache_path
-CLOUD_AWS_TEST_CONFIG = """
-
-
-
-
-
-
-"""
-
-
-CLOUD_AWS_TEST_CONFIG_YAML = """
-type: cloud
-provider: aws
-auth:
- access_key: access_moo
- secret_key: secret_cow
-
-bucket:
- name: unique_bucket_name_all_lowercase
- use_reduced_redundancy: false
-
-cache:
- path: database/object_store_cache
- size: 1000
-
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_cloud
-- type: temp
- path: database/tmp_cloud
-"""
+@patch_object_stores_to_skip_initialize
+def test_config_parse_boto3():
+ for config_str in [get_example("boto3_simple.xml"), get_example("boto3_simple.yml")]:
+ with TestConfig(config_str) as (directory, object_store):
+ assert object_store.access_key == "access_moo"
+ assert object_store.secret_key == "secret_cow"
+ assert object_store.bucket == "unique_bucket_name_all_lowercase"
-CLOUD_AZURE_TEST_CONFIG = """
-
-
-
-
-
-
-"""
+ # defaults to AWS
+ assert object_store.endpoint_url is None
-CLOUD_AZURE_TEST_CONFIG_YAML = """
-type: cloud
-provider: azure
-auth:
- subscription_id: a_sub_id
- client_id: and_a_client_id
- secret: and_a_secret_key
- tenant: and_some_tenant_info
+ cache_target = object_store.cache_target
+ assert cache_target.size == 1000
+ assert cache_target.path == "database/object_store_cache"
+ assert object_store.extra_dirs["job_work"] == "database/job_working_directory_s3"
+ assert object_store.extra_dirs["temp"] == "database/tmp_s3"
-bucket:
- name: unique_bucket_name_all_lowercase
- use_reduced_redundancy: false
+ as_dict = object_store.to_dict()
+ _assert_has_keys(as_dict, ["auth", "bucket", "connection", "cache", "extra_dirs", "type"])
-cache:
- path: database/object_store_cache
- size: 1000
+ _assert_key_has_value(as_dict, "type", "boto3")
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_cloud
-- type: temp
- path: database/tmp_cloud
-"""
+ auth_dict = as_dict["auth"]
+ bucket_dict = as_dict["bucket"]
+ cache_dict = as_dict["cache"]
+ _assert_key_has_value(auth_dict, "access_key", "access_moo")
+ _assert_key_has_value(auth_dict, "secret_key", "secret_cow")
-CLOUD_GOOGLE_TEST_CONFIG = """
-
-
-
-
-
-
-"""
+ _assert_key_has_value(bucket_dict, "name", "unique_bucket_name_all_lowercase")
-CLOUD_GOOGLE_TEST_CONFIG_YAML = """
-type: cloud
-provider: google
-auth:
- credentials_file: gcp.config
+ _assert_key_has_value(cache_dict, "size", 1000)
+ _assert_key_has_value(cache_dict, "path", "database/object_store_cache")
-bucket:
- name: unique_bucket_name_all_lowercase
- use_reduced_redundancy: false
+ extra_dirs = as_dict["extra_dirs"]
+ assert len(extra_dirs) == 2
-cache:
- path: database/object_store_cache
- size: 1000
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_cloud
-- type: temp
- path: database/tmp_cloud
-"""
+@patch_object_stores_to_skip_initialize
+def test_config_parse_boto3_custom_connection():
+ for config_str in [get_example("boto3_custom_connection.xml"), get_example("boto3_custom_connection.yml")]:
+ with TestConfig(config_str) as (directory, object_store):
+ assert object_store.endpoint_url == "https://s3.example.org/"
+ assert object_store.region == "the_example_region"
+@patch_object_stores_to_skip_initialize
+def test_config_parse_boto3_merged_transfer_options():
+ for config_str in [
+ get_example("boto3_merged_transfer_options.xml"),
+ get_example("boto3_merged_transfer_options.yml"),
+ ]:
+ with TestConfig(config_str) as (directory, object_store):
+ as_dict = object_store.to_dict()
+ transfer_dict = as_dict["transfer"]
+ assert transfer_dict["multipart_threshold"] == 13
+ assert transfer_dict["max_concurrency"] == 13
+ assert transfer_dict["multipart_chunksize"] == 13
+ assert transfer_dict["num_download_attempts"] == 13
+ assert transfer_dict["max_io_queue"] == 13
+ assert transfer_dict["io_chunksize"] == 13
+ assert transfer_dict["use_threads"] is False
+ assert transfer_dict["max_bandwidth"] == 13
+
+ for transfer_type in ["upload", "download"]:
+ transfer_config = object_store._transfer_config(transfer_type)
+ assert transfer_config.multipart_threshold == 13
+ assert transfer_config.max_concurrency == 13
+ assert transfer_config.multipart_chunksize == 13
+ assert transfer_config.num_download_attempts == 13
+ assert transfer_config.max_io_queue == 13
+ assert transfer_config.io_chunksize == 13
+ assert transfer_config.use_threads is False
+ assert transfer_config.max_bandwidth == 13
+
+
+@patch_object_stores_to_skip_initialize
+def test_config_parse_boto3_separated_transfer_options():
+ for config_str in [
+ get_example("boto3_separated_transfer_options.xml"),
+ get_example("boto3_separated_transfer_options.yml"),
+ ]:
+ with TestConfig(config_str) as (directory, object_store):
+ transfer_config = object_store._transfer_config("upload")
+ assert transfer_config.multipart_threshold == 13
+ assert transfer_config.max_concurrency == 13
+ assert transfer_config.multipart_chunksize == 13
+ assert transfer_config.num_download_attempts == 13
+ assert transfer_config.max_io_queue == 13
+ assert transfer_config.io_chunksize == 13
+ assert transfer_config.use_threads is False
+ assert transfer_config.max_bandwidth == 13
+
+ transfer_config = object_store._transfer_config("download")
+ assert transfer_config.multipart_threshold == 14
+ assert transfer_config.max_concurrency == 14
+ assert transfer_config.multipart_chunksize == 14
+ assert transfer_config.num_download_attempts == 14
+ assert transfer_config.max_io_queue == 14
+ assert transfer_config.io_chunksize == 14
+ assert transfer_config.use_threads is True
+ assert transfer_config.max_bandwidth == 14
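+
+
+# A hedged aside on the two tests above: un-prefixed transfer options apparently apply
+# to both directions, while upload_/download_ prefixed options win for their direction
+# only. The hypothetical helper below is not Galaxy's actual implementation; it only
+# documents that resolution rule.
+def _resolve_transfer_options_sketch(raw_options, direction):
+    # Shared (un-prefixed) options form the baseline for both directions...
+    resolved = {k: v for k, v in raw_options.items() if not k.startswith(("upload_", "download_"))}
+    # ...and direction-specific options override them.
+    prefix = f"{direction}_"
+    for key, value in raw_options.items():
+        if key.startswith(prefix):
+            resolved[key[len(prefix):]] = value
+    return resolved
+
+
+# e.g. _resolve_transfer_options_sketch({"multipart_threshold": 13, "download_max_concurrency": 14}, "download")
+# returns {"multipart_threshold": 13, "max_concurrency": 14}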
+
+
+CLOUD_AWS_TEST_CONFIG = get_example("cloud_aws_simple.xml")
+CLOUD_AWS_TEST_CONFIG_YAML = get_example("cloud_aws_simple.yml")
+
+CLOUD_AZURE_TEST_CONFIG = get_example("cloud_azure_simple.xml")
+CLOUD_AZURE_TEST_CONFIG_YAML = get_example("cloud_azure_simple.yml")
+
+CLOUD_GOOGLE_TEST_CONFIG = get_example("cloud_gcp_simple.xml")
+CLOUD_GOOGLE_TEST_CONFIG_YAML = get_example("cloud_gcp_simple.yml")
+
+
+@patch_object_stores_to_skip_initialize
def test_config_parse_cloud():
for config_str in [
CLOUD_AWS_TEST_CONFIG,
@@ -988,7 +784,7 @@ def test_config_parse_cloud():
path = os.path.join(tmpdir, "gcp.config")
open(path, "w").write("some_gcp_config")
config_str = config_str.replace("gcp.config", path)
- with TestConfig(config_str, clazz=UninitializedCloudObjectStore) as (directory, object_store):
+ with TestConfig(config_str) as (directory, object_store):
assert object_store.bucket_name == "unique_bucket_name_all_lowercase"
assert object_store.use_rr is False
@@ -1029,19 +825,13 @@ def test_config_parse_cloud():
assert len(extra_dirs) == 2
-CLOUD_AWS_NO_AUTH_TEST_CONFIG = """
-
-
-
-
-
-
-"""
+CLOUD_AWS_NO_AUTH_TEST_CONFIG = get_example("cloud_aws_no_auth.xml")
+
+
+@patch_object_stores_to_skip_initialize
def test_config_parse_cloud_noauth_for_aws():
for config_str in [CLOUD_AWS_NO_AUTH_TEST_CONFIG]:
- with TestConfig(config_str, clazz=UninitializedCloudObjectStore) as (directory, object_store):
+ with TestConfig(config_str) as (directory, object_store):
assert object_store.bucket_name == "unique_bucket_name_all_lowercase"
assert object_store.use_rr is False
@@ -1062,7 +852,6 @@ def test_config_parse_cloud_noauth_for_aws():
provider = as_dict["provider"]
assert provider == "aws"
- print(auth_dict["access_key"])
_assert_key_has_value(auth_dict, "access_key", None)
_assert_key_has_value(auth_dict, "secret_key", None)
@@ -1076,62 +865,29 @@ def test_config_parse_cloud_noauth_for_aws():
assert len(extra_dirs) == 2
-CLOUD_AWS_NO_CACHE_TEST_CONFIG = """
-
-
-
-
-
-"""
+CLOUD_AWS_NO_CACHE_TEST_CONFIG = get_example("cloud_aws_default_cache.xml")
+
+
+@patch_object_stores_to_skip_initialize
def test_config_parse_cloud_no_cache_for_aws():
for config_str in [CLOUD_AWS_NO_CACHE_TEST_CONFIG]:
- with TestConfig(config_str, clazz=UninitializedCloudObjectStore) as (directory, object_store):
+ with TestConfig(config_str) as (directory, object_store):
assert object_store.staging_path == directory.global_config.object_store_cache_path
assert object_store.cache_size == -1
-AZURE_BLOB_TEST_CONFIG = """
-
-
-
-
-
-
-"""
-
-
-AZURE_BLOB_TEST_CONFIG_YAML = """
-type: azure_blob
-auth:
- account_name: azureact
- account_key: password123
-
-container:
- name: unique_container_name
- max_chunk_size: 250
-
-cache:
- path: database/object_store_cache
- size: 100
-
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_azure
-- type: temp
- path: database/tmp_azure
-"""
+AZURE_BLOB_TEST_CONFIG = get_example("azure_simple.xml")
+AZURE_BLOB_TEST_CONFIG_YAML = get_example("azure_simple.yml")
+
+
+@patch_object_stores_to_skip_initialize
def test_config_parse_azure():
for config_str in [AZURE_BLOB_TEST_CONFIG, AZURE_BLOB_TEST_CONFIG_YAML]:
- with TestConfig(config_str, clazz=UninitializedAzureBlobObjectStore) as (directory, object_store):
+ with TestConfig(config_str) as (directory, object_store):
assert object_store.account_name == "azureact"
assert object_store.account_key == "password123"
assert object_store.container_name == "unique_container_name"
- assert object_store.max_chunk_size == 250
cache_target = object_store.cache_target
assert cache_target.size == 100
@@ -1152,7 +908,6 @@ def test_config_parse_azure():
_assert_key_has_value(auth_dict, "account_key", "password123")
_assert_key_has_value(container_dict, "name", "unique_container_name")
- _assert_key_has_value(container_dict, "max_chunk_size", 250)
_assert_key_has_value(cache_dict, "size", 100)
_assert_key_has_value(cache_dict, "path", "database/object_store_cache")
@@ -1161,6 +916,18 @@ def test_config_parse_azure():
assert len(extra_dirs) == 2
+@patch_object_stores_to_skip_initialize
+def test_config_parse_azure_transfer():
+ for config_str in [get_example("azure_transfer.xml"), get_example("azure_transfer.yml")]:
+ with TestConfig(config_str) as (directory, object_store):
+ as_dict = object_store.to_dict()["transfer"]
+ assert as_dict["download_max_concurrency"] == 1
+ assert as_dict["upload_max_concurrency"] == 2
+ assert as_dict["max_single_put_size"] == 10
+ assert as_dict["max_single_get_size"] == 20
+ assert as_dict["max_block_size"] == 3
+
+
def test_cache_monitor_thread(tmp_path):
cache_dir = tmp_path
path = cache_dir / "a_file_0"
@@ -1209,36 +976,14 @@ def test_fits_in_cache_check(tmp_path):
assert noop_cache_target.fits_in_cache(1024 * 1024 * 1024 * 100)
-AZURE_BLOB_NO_CACHE_TEST_CONFIG = """
-
-
-
-
-
-"""
-
-
-AZURE_BLOB_NO_CACHE_TEST_CONFIG_YAML = """
-type: azure_blob
-auth:
- account_name: azureact
- account_key: password123
-
-container:
- name: unique_container_name
- max_chunk_size: 250
-
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_azure
-- type: temp
- path: database/tmp_azure
-"""
+AZURE_BLOB_NO_CACHE_TEST_CONFIG = get_example("azure_default_cache.xml")
+AZURE_BLOB_NO_CACHE_TEST_CONFIG_YAML = get_example("azure_default_cache.yml")
+
+
+@patch_object_stores_to_skip_initialize
def test_config_parse_azure_no_cache():
for config_str in [AZURE_BLOB_NO_CACHE_TEST_CONFIG, AZURE_BLOB_NO_CACHE_TEST_CONFIG_YAML]:
- with TestConfig(config_str, clazz=UninitializedAzureBlobObjectStore) as (directory, object_store):
+ with TestConfig(config_str) as (directory, object_store):
assert object_store.cache_size == -1
assert object_store.staging_path == directory.global_config.object_store_cache_path
@@ -1321,6 +1066,52 @@ def verify_caching_object_store_functionality(tmp_path, object_store, check_get_
reset_cache(object_store.cache_target)
assert not object_store.exists(to_delete_dataset)
+ # Test a bigger file to force multi-part processing.
+ big_file_dataset = MockDataset(6)
+ size = 1024
+ path = tmp_path / "big_file.bytes"
+ with path.open("wb") as f:
+ f.write(os.urandom(size))
+ object_store.update_from_file(big_file_dataset, file_name=path, create=True)
+
+ extra_files_dataset = MockDataset(7)
+ object_store.create(extra_files_dataset)
+ extra = tmp_path / "extra"
+ extra.mkdir()
+ extra_file = extra / "new_value.txt"
+ extra_file.write_text("My new value")
+
+ persist_extra_files_for_dataset(
+ object_store,
+ extra,
+ extra_files_dataset, # type: ignore[arg-type,unused-ignore]
+ extra_files_dataset._extra_files_rel_path,
+ )
+
+ # The following checks used to exhibit different behavior depending on how the
+ # cache was cleaned - removing the whole directory versus cleaning up individual
+ # files the way Galaxy's internal caching does with reset_cache - so we test
+ # both approaches here.
+
+ # hard reset
+ shutil.rmtree(object_store.cache_target.path)
+ os.makedirs(object_store.cache_target.path)
+
+ extra_path = _extra_file_path(object_store, extra_files_dataset)
+ assert os.path.exists(extra_path)
+ expected_extra_file = os.path.join(extra_path, "new_value.txt")
+ assert os.path.exists(expected_extra_file)
+ assert open(expected_extra_file).read() == "My new value"
+
+ # Redo the above test with Galaxy's reset_cache, which leaves empty directories
+ # around.
+ reset_cache(object_store.cache_target)
+ extra_path = _extra_file_path(object_store, extra_files_dataset)
+ assert os.path.exists(extra_path)
+ expected_extra_file = os.path.join(extra_path, "new_value.txt")
+ assert os.path.exists(expected_extra_file)
+ assert open(expected_extra_file).read() == "My new value"
+
# Test get_object_url returns a read-only URL
url = object_store.get_object_url(hello_world_dataset)
if check_get_url:
@@ -1329,6 +1120,13 @@ def verify_caching_object_store_functionality(tmp_path, object_store, check_get_
assert response.text == "Hello World!"
+def _extra_file_path(object_store, dataset):
+ # Invoke the same calls the model layer would make here...
+ if object_store.exists(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path):
+ return object_store.get_filename(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path)
+ return object_store.construct_path(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path, in_cache=True)
+
+
def verify_object_store_functionality(tmp_path, object_store, check_get_url=True):
# Test no dataset with id 1 exists.
absent_dataset = MockDataset(1)
@@ -1382,238 +1180,115 @@ def verify_object_store_functionality(tmp_path, object_store, check_get_url=True
assert response.text == "Hello World!"
-AZURE_BLOB_TEMPLATE_TEST_CONFIG_YAML = """
-type: azure_blob
-store_by: uuid
-auth:
- account_name: ${account_name}
- account_key: ${account_key}
-
-container:
- name: ${container_name}
-
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_azure
-- type: temp
- path: database/tmp_azure
-"""
+def integration_test_config(example_filename: str):
+ return TestConfig(get_example(example_filename), inject_galaxy_test_env=True)
@skip_unless_environ("GALAXY_TEST_AZURE_CONTAINER_NAME")
@skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_KEY")
@skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_NAME")
def test_real_azure_blob_store(tmp_path):
- template_vars = {
- "container_name": os.environ["GALAXY_TEST_AZURE_CONTAINER_NAME"],
- "account_key": os.environ["GALAXY_TEST_AZURE_ACCOUNT_KEY"],
- "account_name": os.environ["GALAXY_TEST_AZURE_ACCOUNT_NAME"],
- }
- with TestConfig(AZURE_BLOB_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (_, object_store):
+ with integration_test_config("azure_integration_test.yml") as (_, object_store):
verify_caching_object_store_functionality(tmp_path, object_store)
-AZURE_BLOB_TEMPLATE_WITH_ACCOUNT_URL_TEST_CONFIG_YAML = """
-type: azure_blob
-store_by: uuid
-auth:
- account_name: ${account_name}
- account_key: ${account_key}
- account_url: ${account_url}
-
-container:
- name: ${container_name}
-
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_azure
-- type: temp
- path: database/tmp_azure
-"""
-
-
@skip_unless_environ("GALAXY_TEST_AZURE_CONTAINER_NAME")
@skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_KEY")
@skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_NAME")
@skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_URL")
def test_real_azure_blob_store_with_account_url(tmp_path):
- template_vars = {
- "container_name": os.environ["GALAXY_TEST_AZURE_CONTAINER_NAME"],
- "account_key": os.environ["GALAXY_TEST_AZURE_ACCOUNT_KEY"],
- "account_name": os.environ["GALAXY_TEST_AZURE_ACCOUNT_NAME"],
- "account_url": os.environ["GALAXY_TEST_AZURE_ACCOUNT_URL"],
- }
- with TestConfig(AZURE_BLOB_TEMPLATE_WITH_ACCOUNT_URL_TEST_CONFIG_YAML, template_vars=template_vars) as (
+ with integration_test_config("azure_integration_test_with_account_url.yml") as (
_,
object_store,
):
verify_caching_object_store_functionality(tmp_path, object_store)
-AZURE_BLOB_IN_HIERARCHICAL_TEMPLATE_TEST_CONFIG_YAML = """
-type: distributed
-backends:
-- type: azure_blob
- id: azure1
- store_by: uuid
- name: Azure Store 1
- allow_selection: true
- weight: 1
- auth:
- account_name: ${account_name}
- account_key: ${account_key}
-
- container:
- name: ${container_name}
-
- extra_dirs:
- - type: job_work
- path: database/job_working_directory_azure_1
- - type: temp
- path: database/tmp_azure_1
-- type: azure_blob
- id: azure2
- store_by: uuid
- name: Azure Store 2
- allow_selection: true
- weight: 1
- auth:
- account_name: ${account_name}
- account_key: ${account_key}
-
- container:
- name: ${container_name}
-
- extra_dirs:
- - type: job_work
- path: database/job_working_directory_azure_2
- - type: temp
- path: database/tmp_azure_2
-"""
-
-
@skip_unless_environ("GALAXY_TEST_AZURE_CONTAINER_NAME")
@skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_KEY")
@skip_unless_environ("GALAXY_TEST_AZURE_ACCOUNT_NAME")
def test_real_azure_blob_store_in_hierarchical(tmp_path):
- template_vars = {
- "container_name": os.environ["GALAXY_TEST_AZURE_CONTAINER_NAME"],
- "account_key": os.environ["GALAXY_TEST_AZURE_ACCOUNT_KEY"],
- "account_name": os.environ["GALAXY_TEST_AZURE_ACCOUNT_NAME"],
- }
- with TestConfig(AZURE_BLOB_IN_HIERARCHICAL_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (
- _,
- object_store,
- ):
+ with integration_test_config("azure_integration_test_distributed.yml") as (_, object_store):
verify_object_store_functionality(tmp_path, object_store)
-AMAZON_S3_SIMPLE_TEMPLATE_TEST_CONFIG_YAML = """
-type: aws_s3
-store_by: uuid
-auth:
- access_key: ${access_key}
- secret_key: ${secret_key}
-
-bucket:
- name: ${bucket}
-
-connection:
- region: ${region}
-
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_azure
-- type: temp
- path: database/tmp_azure
-"""
-
-
@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY")
@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY")
@skip_unless_environ("GALAXY_TEST_AWS_BUCKET")
@skip_unless_environ("GALAXY_TEST_AWS_REGION")
def test_real_aws_s3_store(tmp_path):
- template_vars = {
- "access_key": os.environ["GALAXY_TEST_AWS_ACCESS_KEY"],
- "secret_key": os.environ["GALAXY_TEST_AWS_SECRET_KEY"],
- "bucket": os.environ["GALAXY_TEST_AWS_BUCKET"],
- "region": os.environ["GALAXY_TEST_AWS_REGION"],
- }
- with TestConfig(AMAZON_S3_SIMPLE_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (_, object_store):
+ with integration_test_config("aws_s3_integration_test.yml") as (_, object_store):
verify_caching_object_store_functionality(tmp_path, object_store)
-AMAZON_CLOUDBRIDGE_TEMPLATE_TEST_CONFIG_YAML = """
-type: cloud
-store_by: uuid
-provider: aws
-auth:
- access_key: ${access_key}
- secret_key: ${secret_key}
+@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY")
+@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY")
+@skip_unless_environ("GALAXY_TEST_AWS_BUCKET")
+def test_real_aws_s3_store_boto3(tmp_path):
+ with integration_test_config("boto3_integration_test_aws.yml") as (_, object_store):
+ verify_caching_object_store_functionality(tmp_path, object_store)
-bucket:
- name: ${bucket}
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_azure
-- type: temp
- path: database/tmp_azure
-"""
+@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY")
+@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY")
+@skip_unless_environ("GALAXY_TEST_AWS_BUCKET")
+def test_real_aws_s3_store_boto3_multipart(tmp_path):
+ with integration_test_config("boto3_integration_test_multithreaded.yml") as (_, object_store):
+ verify_caching_object_store_functionality(tmp_path, object_store)
+@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY")
+@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY")
+def test_real_aws_s3_store_boto3_new_bucket(tmp_path):
+ with integration_test_config("boto3_integration_test_aws_new_bucket.yml") as (_, object_store):
+ verify_caching_object_store_functionality(tmp_path, object_store)
+
+
+# This test fails if axel is installed, because axel requires working URLs and
+# generating URLs requires setting a region with the cloudbridge store.
@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY")
@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY")
@skip_unless_environ("GALAXY_TEST_AWS_BUCKET")
def test_aws_via_cloudbridge_store(tmp_path):
- template_vars = {
- "access_key": os.environ["GALAXY_TEST_AWS_ACCESS_KEY"],
- "secret_key": os.environ["GALAXY_TEST_AWS_SECRET_KEY"],
- "bucket": os.environ["GALAXY_TEST_AWS_BUCKET"],
- }
- with TestConfig(AMAZON_CLOUDBRIDGE_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (_, object_store):
+ with integration_test_config("cloud_integration_test_aws.yml") as (_, object_store):
# disabling get_object_url check - cloudbridge in this config assumes the region
# is us-east-1 and generates a URL for that region. This functionality works and can
# be tested if a region is specified in the configuration (see next config and test case).
verify_caching_object_store_functionality(tmp_path, object_store, check_get_url=False)
-AMAZON_CLOUDBRIDGE_WITH_REGION_TEMPLATE_TEST_CONFIG_YAML = """
-type: cloud
-store_by: uuid
-provider: aws
-auth:
- access_key: ${access_key}
- secret_key: ${secret_key}
- region: ${region}
-
-bucket:
- name: ${bucket}
-
-extra_dirs:
-- type: job_work
- path: database/job_working_directory_azure
-- type: temp
- path: database/tmp_azure
-"""
-
-
@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY")
@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY")
@skip_unless_environ("GALAXY_TEST_AWS_BUCKET")
@skip_unless_environ("GALAXY_TEST_AWS_REGION")
def test_aws_via_cloudbridge_store_with_region(tmp_path):
- template_vars = {
- "access_key": os.environ["GALAXY_TEST_AWS_ACCESS_KEY"],
- "secret_key": os.environ["GALAXY_TEST_AWS_SECRET_KEY"],
- "bucket": os.environ["GALAXY_TEST_AWS_BUCKET"],
- "region": os.environ["GALAXY_TEST_AWS_REGION"],
- }
- with TestConfig(AMAZON_CLOUDBRIDGE_WITH_REGION_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (
- _,
- object_store,
- ):
+ with integration_test_config("cloud_integration_test_aws_with_region.yml") as (_, object_store):
+ verify_caching_object_store_functionality(tmp_path, object_store)
+
+
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_BUCKET")
+def test_gcp_via_s3_interop(tmp_path):
+ with integration_test_config("gcp_s3_integration_test.yml") as (_, object_store):
+ verify_caching_object_store_functionality(tmp_path, object_store)
+
+
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_BUCKET")
+def test_gcp_via_s3_interop_and_boto3(tmp_path):
+ with integration_test_config("gcp_boto3_integration_test.yml") as (_, object_store):
+ verify_caching_object_store_functionality(tmp_path, object_store)
+
+
+# Ensures boto3 will consume the legacy connection parameters that the generic_s3
+# object store accepts (see the hedged sketch after this test).
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_BUCKET")
+def test_gcp_via_s3_interop_and_boto3_with_legacy_params(tmp_path):
+ with integration_test_config("gcp_boto3_integration_test_legacy_params.yml") as (_, object_store):
verify_caching_object_store_functionality(tmp_path, object_store)
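+
+
+# Hedged sketch of the "legacy parameters" idea exercised above: generic_s3-style
+# host/port/secure/conn_path settings can presumably be collapsed into a boto3-style
+# endpoint_url roughly like this (hypothetical helper, not Galaxy's actual code).
+def _endpoint_url_from_legacy_params_sketch(host, port=None, secure=True, conn_path=""):
+    scheme = "https" if secure else "http"
+    netloc = f"{host}:{port}" if port else host
+    return f"{scheme}://{netloc}{conn_path or ''}"
+
+
+# e.g. _endpoint_url_from_legacy_params_sketch("storage.googleapis.com", 443, True, "/")
+# returns "https://storage.googleapis.com:443/"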
@@ -1628,6 +1303,10 @@ def rel_path_for_uuid_test(self):
rel_path = os.path.join(*directory_hash_id(self.uuid))
return rel_path
+ @property
+ def _extra_files_rel_path(self):
+ return f"dataset_{self.uuid}_files"
+
def _assert_has_keys(the_dict, keys):
for key in keys: