Merge pull request #18136 from jmchilton/object_store_unit_testing

More unit testing for object store stuff.

mvdbeek authored May 15, 2024
2 parents 919bf37 + 55bd5c7 commit 8be09c8
Showing 66 changed files with 2,373 additions and 2,459 deletions.
87 changes: 84 additions & 3 deletions lib/galaxy/config/sample/object_store_conf.sample.yml
@@ -135,10 +135,64 @@ backends:
store_by: uuid
files_dir: /old-fs/galaxy/files


# There are now four ways to access S3-related services. Two are
# suitable only for AWS services (aws_s3 & cloud), one is better
# suited for non-AWS S3-compatible services (generic_s3), and
# finally boto3 gracefully handles either scenario.
#
# boto3 is built on the newest and most widely used Python S3 client
# outside of Galaxy. It has advanced transfer options and is likely
# the client you should use for new setups. generic_s3 and aws_s3
# have existed in Galaxy for longer and could perhaps be considered
# more battle tested. Both boto3 and generic_s3 have been tested
# with multiple non-AWS APIs, including minio and GCP. The cloud
# implementation is based on CloudBridge; it is still supported and
# has been tested recently, but the downside is that the advanced
# multi-threaded processing options of boto3 are not available and
# it has not been battle tested like aws_s3.

#
# Sample AWS S3 Object Store configuration (newest boto3 client)
#
type: boto3
auth:
access_key: ...
secret_key: ...
bucket:
name: unique_bucket_name_all_lowercase
connection: # not strictly needed, but more of the API works when this is set.
region: us-east-1
transfer:
multipart_threshold: 10000000
download_max_concurrency: 5
upload_max_concurrency: 10
# any of these options:
# multipart_threshold, max_concurrency, multipart_chunksize,
# num_download_attempts, max_io_queue, io_chunksize, use_threads,
# and max_bandwidth
# can be set. By default they apply to both uploads and downloads,
# but they can be prefixed with upload_ or download_ (as shown above)
# to apply to just one direction. More information about these parameters
# can be found at:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig
# (an illustrative sketch of how these options map onto boto3 follows this sample).

cache:
path: database/object_store_cache_s3
size: 1000
cache_updated_data: true
extra_dirs:
- type: job_work
path: database/job_working_directory_s3
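
# For reference, the transfer options above correspond to boto3's
# boto3.s3.transfer.TransferConfig. The commented Python below is an
# illustrative sketch only (it is not Galaxy code, and the bucket, key,
# and file names are made up) showing how options like these translate
# into boto3 transfer calls:
#
#   import boto3
#   from boto3.s3.transfer import TransferConfig
#
#   # 10 MB threshold before multipart transfers kick in (multipart_threshold above)
#   upload_config = TransferConfig(multipart_threshold=10_000_000, max_concurrency=10)
#   download_config = TransferConfig(multipart_threshold=10_000_000, max_concurrency=5)
#
#   s3 = boto3.client("s3")
#   s3.upload_file("dataset.dat", "unique_bucket_name_all_lowercase",
#                  "datasets/dataset.dat", Config=upload_config)
#   s3.download_file("unique_bucket_name_all_lowercase", "datasets/dataset.dat",
#                    "dataset_copy.dat", Config=download_config)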



#
# Sample AWS S3 Object Store configuration (legacy boto implementation)
#

# This implementation will use axel automatically for file transfers if it is on
# Galaxy's path. Otherwise, it will use various Python-based strategies for
# multipart upload of large files, but all downloads will be single-threaded.
type: aws_s3
auth:
access_key: ...
@@ -147,6 +201,8 @@ bucket:
name: unique_bucket_name_all_lowercase
use_reduced_redundancy: false
max_chunk_size: 250
connection: # not strictly needed, but more of the API works when this is set.
region: us-east-1
cache:
path: database/object_store_cache_s3
size: 1000
@@ -182,7 +238,32 @@ extra_dirs:
path: database/job_working_directory_irods

#
# Sample non-AWS S3 Object Store (e.g. swift) configuration (boto3)
#

type: boto3
auth:
access_key: ...
secret_key: ...
bucket:
name: unique_bucket_name_all_lowercase
connection:
endpoint_url: https://swift.example.org:6000/
# region: some services may make use of region if specified.
# The older-style host, port, secure, and conn_path options available to
# generic_s3 also work here - Galaxy will just infer an endpoint_url from
# them (see the illustrative sketch after this sample).
cache:
path: database/object_store_cache_swift
size: 1000
cache_updated_data: true
# transfer: # see transfer options for boto3 above in AWS configuration.
extra_dirs:
- type: job_work
path: database/job_working_directory_swift
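
# To illustrate the note above about the older-style options: a generic_s3
# style connection block such as
#
#   connection:
#     host: swift.example.org
#     port: 6000
#     secure: true
#     conn_path: /
#
# is folded into an endpoint_url roughly as in this sketch (illustrative
# only - the exact inference is done inside Galaxy's boto3 object store code):
#
#   scheme = "https" if secure else "http"
#   endpoint_url = f"{scheme}://{host}:{port}{conn_path or ''}"
#   # -> "https://swift.example.org:6000/"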


#
# Sample non-AWS S3 Object Store (e.g. swift) configuration (legacy boto client)
#

type: generic_s3
3 changes: 3 additions & 0 deletions lib/galaxy/dependencies/__init__.py
@@ -234,6 +234,9 @@ def check_python_pam(self):
def check_azure_storage(self):
return "azure_blob" in self.object_stores

def check_boto3(self):
return "boto3" in self.object_stores

def check_kamaki(self):
return "pithos" in self.object_stores

1 change: 1 addition & 0 deletions lib/galaxy/dependencies/dev-requirements.txt
@@ -10,6 +10,7 @@ babel==2.14.0 ; python_version >= "3.8" and python_version < "3.13"
backports-tarfile==1.1.1 ; python_version >= "3.8" and python_version < "3.12"
backports-zoneinfo==0.2.1 ; python_version >= "3.8" and python_version < "3.9"
black==24.4.2 ; python_version >= "3.8" and python_version < "3.13"
boto3==1.34.69 ; python_version >= "3.8" and python_version < "3.13"
build==1.2.1 ; python_version >= "3.8" and python_version < "3.13"
cachecontrol[filecache]==0.14.0 ; python_version >= "3.8" and python_version < "3.13"
certifi==2024.2.2 ; python_version >= "3.8" and python_version < "3.13"
62 changes: 32 additions & 30 deletions lib/galaxy/objectstore/__init__.py
@@ -55,7 +55,10 @@
from .caching import CacheTarget

if TYPE_CHECKING:
from galaxy.model import DatasetInstance
from galaxy.model import (
Dataset,
DatasetInstance,
)

NO_SESSION_ERROR_MESSAGE = (
"Attempted to 'create' object store entity in configuration with no database session present."
@@ -373,16 +376,6 @@ def shutdown(self):
"""Close any connections for this ObjectStore."""
self.running = False

def file_ready(
self, obj, base_dir=None, dir_only=False, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False
):
"""
Check if a file corresponding to a dataset is ready to be used.
Return True if so, False otherwise
"""
return True

@classmethod
def parse_xml(clazz, config_xml):
"""Parse an XML description of a configuration for this object store.
@@ -938,10 +931,6 @@ def _exists(self, obj, **kwargs):
"""Determine if the `obj` exists in any of the backends."""
return self._call_method("_exists", obj, False, False, **kwargs)

def file_ready(self, obj, **kwargs):
"""Determine if the file for `obj` is ready to be used by any of the backends."""
return self._call_method("file_ready", obj, False, False, **kwargs)

def _create(self, obj, **kwargs):
"""Create a backing file in a random backend."""
objectstore = random.choice(list(self.backends.values()))
@@ -1400,6 +1389,10 @@ def type_to_object_store_class(store: str, fsmon: bool = False) -> Tuple[Type[Ba
objectstore_constructor_kwds = {}
if store == "disk":
objectstore_class = DiskObjectStore
elif store == "boto3":
from .s3_boto3 import S3ObjectStore as Boto3ObjectStore

objectstore_class = Boto3ObjectStore
elif store in ["s3", "aws_s3"]:
from .s3 import S3ObjectStore

@@ -1672,18 +1665,27 @@ def persist_extra_files(
if not extra_files_path_name:
extra_files_path_name = primary_data.dataset.extra_files_path_name_from(object_store)
assert extra_files_path_name
for root, _dirs, files in safe_walk(src_extra_files_path):
extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path))
extra_dir = os.path.normpath(extra_dir)
for f in files:
if not in_directory(f, src_extra_files_path):
# Unclear if this can ever happen if we use safe_walk ... probably not ?
raise MalformedContents(f"Invalid dataset path: {f}")
object_store.update_from_file(
primary_data.dataset,
extra_dir=extra_dir,
alt_name=f,
file_name=os.path.join(root, f),
create=True,
preserve_symlinks=True,
)
persist_extra_files_for_dataset(object_store, src_extra_files_path, primary_data.dataset, extra_files_path_name)


def persist_extra_files_for_dataset(
object_store: ObjectStore,
src_extra_files_path: str,
dataset: "Dataset",
extra_files_path_name: str,
):
for root, _dirs, files in safe_walk(src_extra_files_path):
extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path))
extra_dir = os.path.normpath(extra_dir)
for f in files:
if not in_directory(f, src_extra_files_path):
# Unclear if this can ever happen if we use safe_walk ... probably not ?
raise MalformedContents(f"Invalid dataset path: {f}")
object_store.update_from_file(
dataset,
extra_dir=extra_dir,
alt_name=f,
file_name=os.path.join(root, f),
create=True,
preserve_symlinks=True,
)
