Merge pull request #18136 from jmchilton/object_store_unit_testing

More unit testing for object store stuff.

mvdbeek authored May 15, 2024
2 parents 919bf37 + 55bd5c7 commit 8be09c8
Showing 66 changed files with 2,373 additions and 2,459 deletions.
87 changes: 84 additions & 3 deletions lib/galaxy/config/sample/object_store_conf.sample.yml
@@ -135,10 +135,64 @@ backends:
store_by: uuid
files_dir: /old-fs/galaxy/files


# There are now four ways to access S3-related services. Two are
# suitable only for AWS services (aws_s3 & cloud), one is better
# suited for non-AWS S3-compatible services (generic_s3), and
# finally boto3 gracefully handles either scenario.
#
# boto3 is built on the newest and most widely used Python S3 client
# outside of Galaxy. It has advanced transfer options and is likely
# the client you should use for new setups. generic_s3 and aws_s3
# have existed in Galaxy for longer and could perhaps be considered
# more battle tested. Both boto3 and generic_s3 have been tested
# with multiple non-AWS APIs, including minio and GCP. The cloud
# implementation is based on CloudBridge; it is still supported and
# has been tested recently, but the downside is that the advanced
# multi-threaded processing options of boto3 are not available and
# it has not been battle tested like aws_s3.

#
# Sample AWS S3 Object Store configuration (newest boto3 client)
#
type: boto3
auth:
access_key: ...
secret_key: ...
bucket:
name: unique_bucket_name_all_lowercase
connection: # not strictly needed, but more of the API works when this is set.
region: us-east-1
transfer:
multipart_threshold: 10000000
download_max_concurrency: 5
upload_max_concurrency: 10
# any of these options:
# multipart_threshold, max_concurrency, multipart_chunksize,
# num_download_attempts, max_io_queue, io_chunksize, use_threads,
# and max_bandwidth
# can be set. By default they apply to both uploads and downloads,
# but they can be prefixed with upload_ or download_ (as shown above)
# to apply to just one direction. More information about these parameters
# can be found at:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig
# (an illustrative sketch of how these options map onto boto3 follows this sample).

cache:
path: database/object_store_cache_s3
size: 1000
cache_updated_data: true
extra_dirs:
- type: job_work
path: database/job_working_directory_s3
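
# For reference, the transfer options above correspond to boto3's
# boto3.s3.transfer.TransferConfig. The commented Python below is an
# illustrative sketch only (it is not Galaxy code, and the bucket, key,
# and file names are made up) showing how options like these translate
# into boto3 transfer calls:
#
#   import boto3
#   from boto3.s3.transfer import TransferConfig
#
#   # 10 MB threshold before multipart transfers kick in (multipart_threshold above)
#   upload_config = TransferConfig(multipart_threshold=10_000_000, max_concurrency=10)
#   download_config = TransferConfig(multipart_threshold=10_000_000, max_concurrency=5)
#
#   s3 = boto3.client("s3")
#   s3.upload_file("dataset.dat", "unique_bucket_name_all_lowercase",
#                  "datasets/dataset.dat", Config=upload_config)
#   s3.download_file("unique_bucket_name_all_lowercase", "datasets/dataset.dat",
#                    "dataset_copy.dat", Config=download_config)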



#
# Sample AWS S3 Object Store configuration (legacy boto implementation)
#

# This implementation will use axel automatically for file transfers if it is on
# Galaxy's path. Otherwise, it will use various Python-based strategies for
# multipart upload of large files, but all downloads will be single-threaded.
type: aws_s3
auth:
access_key: ...
@@ -147,6 +201,8 @@ bucket:
name: unique_bucket_name_all_lowercase
use_reduced_redundancy: false
max_chunk_size: 250
connection: # not strictly needed, but more of the API works when this is set.
region: us-east-1
cache:
path: database/object_store_cache_s3
size: 1000
@@ -182,7 +238,32 @@ extra_dirs:
path: database/job_working_directory_irods

#
# Sample non-AWS S3 Object Store (e.g. swift) configuration (boto3)
#

type: boto3
auth:
access_key: ...
secret_key: ...
bucket:
name: unique_bucket_name_all_lowercase
connection:
endpoint_url: https://swift.example.org:6000/
# region: some services may make use of region if specified.
# The older-style host, port, secure, and conn_path options available to
# generic_s3 also work here - Galaxy will just infer an endpoint_url from
# them (see the illustrative sketch after this sample).
cache:
path: database/object_store_cache_swift
size: 1000
cache_updated_data: true
# transfer: # see transfer options for boto3 above in AWS configuration.
extra_dirs:
- type: job_work
path: database/job_working_directory_swift
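
# To illustrate the note above about the older-style options: a generic_s3
# style connection block such as
#
#   connection:
#     host: swift.example.org
#     port: 6000
#     secure: true
#     conn_path: /
#
# is folded into an endpoint_url roughly as in this sketch (illustrative
# only - the exact inference is done inside Galaxy's boto3 object store code):
#
#   scheme = "https" if secure else "http"
#   endpoint_url = f"{scheme}://{host}:{port}{conn_path or ''}"
#   # -> "https://swift.example.org:6000/"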


#
# Sample non-AWS S3 Object Store (e.g. swift) configuration (legacy boto client)
#

type: generic_s3
3 changes: 3 additions & 0 deletions lib/galaxy/dependencies/__init__.py
@@ -234,6 +234,9 @@ def check_python_pam(self):
def check_azure_storage(self):
return "azure_blob" in self.object_stores

def check_boto3(self):
return "boto3" in self.object_stores

def check_kamaki(self):
return "pithos" in self.object_stores

1 change: 1 addition & 0 deletions lib/galaxy/dependencies/dev-requirements.txt
@@ -10,6 +10,7 @@ babel==2.14.0 ; python_version >= "3.8" and python_version < "3.13"
backports-tarfile==1.1.1 ; python_version >= "3.8" and python_version < "3.12"
backports-zoneinfo==0.2.1 ; python_version >= "3.8" and python_version < "3.9"
black==24.4.2 ; python_version >= "3.8" and python_version < "3.13"
boto3==1.34.69 ; python_version >= "3.8" and python_version < "3.13"
build==1.2.1 ; python_version >= "3.8" and python_version < "3.13"
cachecontrol[filecache]==0.14.0 ; python_version >= "3.8" and python_version < "3.13"
certifi==2024.2.2 ; python_version >= "3.8" and python_version < "3.13"
62 changes: 32 additions & 30 deletions lib/galaxy/objectstore/__init__.py
@@ -55,7 +55,10 @@
from .caching import CacheTarget

if TYPE_CHECKING:
from galaxy.model import DatasetInstance
from galaxy.model import (
Dataset,
DatasetInstance,
)

NO_SESSION_ERROR_MESSAGE = (
"Attempted to 'create' object store entity in configuration with no database session present."
@@ -373,16 +376,6 @@ def shutdown(self):
"""Close any connections for this ObjectStore."""
self.running = False

def file_ready(
self, obj, base_dir=None, dir_only=False, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False
):
"""
Check if a file corresponding to a dataset is ready to be used.
Return True if so, False otherwise
"""
return True

@classmethod
def parse_xml(clazz, config_xml):
"""Parse an XML description of a configuration for this object store.
@@ -938,10 +931,6 @@ def _exists(self, obj, **kwargs):
"""Determine if the `obj` exists in any of the backends."""
return self._call_method("_exists", obj, False, False, **kwargs)

def file_ready(self, obj, **kwargs):
"""Determine if the file for `obj` is ready to be used by any of the backends."""
return self._call_method("file_ready", obj, False, False, **kwargs)

def _create(self, obj, **kwargs):
"""Create a backing file in a random backend."""
objectstore = random.choice(list(self.backends.values()))
@@ -1400,6 +1389,10 @@ def type_to_object_store_class(store: str, fsmon: bool = False) -> Tuple[Type[Ba
objectstore_constructor_kwds = {}
if store == "disk":
objectstore_class = DiskObjectStore
elif store == "boto3":
from .s3_boto3 import S3ObjectStore as Boto3ObjectStore

objectstore_class = Boto3ObjectStore
elif store in ["s3", "aws_s3"]:
from .s3 import S3ObjectStore

@@ -1672,18 +1665,27 @@ def persist_extra_files(
if not extra_files_path_name:
extra_files_path_name = primary_data.dataset.extra_files_path_name_from(object_store)
assert extra_files_path_name
for root, _dirs, files in safe_walk(src_extra_files_path):
extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path))
extra_dir = os.path.normpath(extra_dir)
for f in files:
if not in_directory(f, src_extra_files_path):
# Unclear if this can ever happen if we use safe_walk ... probably not ?
raise MalformedContents(f"Invalid dataset path: {f}")
object_store.update_from_file(
primary_data.dataset,
extra_dir=extra_dir,
alt_name=f,
file_name=os.path.join(root, f),
create=True,
preserve_symlinks=True,
)
persist_extra_files_for_dataset(object_store, src_extra_files_path, primary_data.dataset, extra_files_path_name)


def persist_extra_files_for_dataset(
object_store: ObjectStore,
src_extra_files_path: str,
dataset: "Dataset",
extra_files_path_name: str,
):
for root, _dirs, files in safe_walk(src_extra_files_path):
extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path))
extra_dir = os.path.normpath(extra_dir)
for f in files:
if not in_directory(f, src_extra_files_path):
# Unclear if this can ever happen if we use safe_walk ... probably not ?
raise MalformedContents(f"Invalid dataset path: {f}")
object_store.update_from_file(
dataset,
extra_dir=extra_dir,
alt_name=f,
file_name=os.path.join(root, f),
create=True,
preserve_symlinks=True,
)
