Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

User-based ObjectStore #4840

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
f85fd4d
Add User-based ObjectStore features.
VJalili Dec 12, 2019
0be18ea
Drop AWS S3 support for User-Based ObjectStore:
VJalili Dec 16, 2019
8141f22
Remove the option of auto user media selection:
VJalili Dec 16, 2019
55d3839
Change how working directory paths are resolved.
VJalili Mar 6, 2020
3155b06
Use a property instead of a len check.
VJalili Mar 6, 2020
7d6a390
Pass user to the ObjectStore constructor rather than its methods.
VJalili Mar 6, 2020
608c4ab
Use the property instead of the len check.
VJalili Mar 6, 2020
d89b9de
Change the UBOS migration script number.
VJalili Mar 6, 2020
e25e994
Replace more count length with accessing property.
VJalili Mar 6, 2020
714c5f8
Add the missing migration script.
VJalili Mar 6, 2020
8474295
Fix issues related to failing tests.
VJalili Mar 6, 2020
bd046c8
Reduce user count to 2 in order for tests to pass on CircleCI.
VJalili Mar 7, 2020
8e1105c
Remove database prefix from UBOS path.
VJalili Mar 7, 2020
999cc0a
Merge remote-tracking branch 'upstream/dev' into UserBasedObjectStore2
VJalili May 22, 2020
54cfbb0
Increment migration script number.
VJalili May 23, 2020
93f71cc
Update to choose media at the _invoke method.
VJalili May 23, 2020
f760280
Remove `ignore_media`, and revert some cosmetic changes.
VJalili May 24, 2020
360b957
Remove configurations that are auto-defined and set to their defaults.
VJalili May 27, 2020
d524e4c
Add storage media path to the test_config_values unit test.
VJalili May 27, 2020
4d1395f
Update lib/galaxy/webapps/galaxy/api/storage_media.py
VJalili May 28, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions doc/source/admin/galaxy_options.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1150,6 +1150,58 @@
:Type: str


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``enable_user_based_object_store``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

:Description:
Enables and disables the user-based object store feature.
:Default: ``false``
:Type: bool


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``default_storage_media_jobs_directory``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

:Description:
Sets a base default jobs working directory for all users storage
media, where each storage media will have a sperate folder under
this path named with the media's encoded ID. This attribute is set
for each media independency in storage_media table; hence, admins
may modify records in that table to define user/media-specific
path.
:Default: ``job_working_directory_storage_media``
:Type: str


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``default_storage_media_cache_path``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

:Description:
Sets a base default cache path for all users storage media, where
each storage media will have a separate folder under this path
named with the media's encoded ID. This attribute is set for each
media independency in storage_media table; hence, admins may
modify records in that table to define user/media-specific path.
:Default: ``storage_media_cache``
:Type: str


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``default_storage_media_cache_size``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

:Description:
Sets a default cache size for all users storage media; in
Gigabytes. This attribute is set for each media independency in
storage_media table; hence, admins may modify records in that
table to define user/media-specific path.
:Default: ``100``
:Type: int


~~~~~~~~~~~~~~~
``smtp_server``
~~~~~~~~~~~~~~~
Expand Down
15 changes: 15 additions & 0 deletions lib/galaxy/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,6 +576,21 @@ def _process_config(self, kwargs):
if not self.file_path_set and self.file_path.endswith('objects'):
self.object_store_store_by = 'uuid'
assert self.object_store_store_by in ['id', 'uuid'], "Invalid value for object_store_store_by [%s]" % self.object_store_store_by
self.object_store_cache_path = self.resolve_path(kwargs.get("object_store_cache_path", os.path.join(self.data_dir, "object_store_cache")))

# Configuration options for new storage media uses can plug-in.
self.enable_user_based_object_store = kwargs.get(
VJalili marked this conversation as resolved.
Show resolved Hide resolved
"enable_user_based_object_store", False)
self.default_storage_media_jobs_directory = self.resolve_path(kwargs.get(
VJalili marked this conversation as resolved.
Show resolved Hide resolved
"default_storage_media_jobs_directory",
os.path.join(self.data_dir, "job_working_directory_storage_media")))
self.default_storage_media_cache_path = self.resolve_path(kwargs.get(
VJalili marked this conversation as resolved.
Show resolved Hide resolved
"default_storage_media_cache_path",
os.path.join(self.data_dir, "storage_media_cache")))
self.default_storage_media_cache_size = kwargs.get(
"default_storage_media_cache_size",
VJalili marked this conversation as resolved.
Show resolved Hide resolved
100)

# Handle AWS-specific config options for backward compatibility
if kwargs.get('aws_access_key') is not None:
self.os_access_key = kwargs.get('aws_access_key')
Expand Down
23 changes: 23 additions & 0 deletions lib/galaxy/config/sample/galaxy.yml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,29 @@ galaxy:
# instance - but the default will be 'id' in many cases.
#object_store_store_by: null

# Enables and disables the user-based object store feature.
#enable_user_based_object_store: false

# Sets a base default jobs working directory for all users storage
# media, where each storage media will have a sperate folder under
# this path named with the media's encoded ID. This attribute is set
# for each media independency in storage_media table; hence, admins
# may modify records in that table to define user/media-specific path.
#default_storage_media_jobs_directory: job_working_directory_storage_media

# Sets a base default cache path for all users storage media, where
# each storage media will have a separate folder under this path named
# with the media's encoded ID. This attribute is set for each media
# independency in storage_media table; hence, admins may modify
# records in that table to define user/media-specific path.
#default_storage_media_cache_path: storage_media_cache

# Sets a default cache size for all users storage media; in Gigabytes.
# This attribute is set for each media independency in storage_media
# table; hence, admins may modify records in that table to define
# user/media-specific path.
#default_storage_media_cache_size: 100

# Galaxy sends mail for various things: subscribing users to the
# mailing list if they request it, password resets, reporting dataset
# errors, and sending activation emails. To do this, it needs to send
Expand Down
22 changes: 21 additions & 1 deletion lib/galaxy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,20 @@ def requires_containerization(self):
def use_metadata_binary(self):
return util.asbool(self.get_destination_configuration('use_metadata_binary', "False"))

def __assign_media(self, job, dataset):
if self.app.config.enable_user_based_object_store and job.user:
all_user_media = job.user.active_storage_media
if job.history is None:
is_history_shared = False
else:
is_history_shared = self.sa_session.query(
self.app.model.HistoryUserShareAssociation).filter_by(history_id=job.history.id).first() is not None
selected_media = model.StorageMedia.choose_media_for_association(
all_user_media,
history_shared=is_history_shared)
if selected_media is not None:
selected_media.associate_with_dataset(dataset)

def can_split(self):
# Should the job handler split this job up?
return self.app.config.use_tasked_jobs and self.tool.parallelism
Expand Down Expand Up @@ -1477,6 +1491,7 @@ def _set_object_store_ids(self, job):
# afterward. State below needs to happen the same way.
for dataset_assoc in job.output_datasets + job.output_library_datasets:
dataset = dataset_assoc.dataset
self.__assign_media(job, dataset.dataset)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've spent months of my life trying to optimize this process of initializing the output datasets. Can we have some property on app that we can check to see if this method would ever doing anything - and skip it if there is no possibility of assigning media?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure! this config property is added that if disabled, this method will not do anything. Would that be addressing your concerns?

object_store_populator.set_object_store_id(dataset)

job.object_store_id = object_store_populator.object_store_id
Expand Down Expand Up @@ -1736,7 +1751,12 @@ def fail():
for dataset_assoc in job.output_datasets:
if not dataset_assoc.dataset.dataset.purged:
dataset_assoc.dataset.dataset.set_total_size()
collected_bytes += dataset_assoc.dataset.dataset.get_total_size()
if not dataset_assoc.dataset.dataset.has_active_storage_media():
collected_bytes += dataset_assoc.dataset.dataset.get_total_size()
else:
for assoc in dataset_assoc.dataset.dataset.active_storage_media_associations:
assoc.storage_media.add_usage(dataset_assoc.dataset.dataset.get_total_size())
self.sa_session.flush()

if job.user:
job.user.adjust_total_disk_usage(collected_bytes)
Expand Down
9 changes: 8 additions & 1 deletion lib/galaxy/jobs/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,14 @@ def __verify_job_ready(self, job, job_wrapper):

if state == JOB_READY:
state = self.__check_user_jobs(job, job_wrapper)
if state == JOB_READY and self.app.config.enable_quotas:
# If user has plugged a media, then they might have enough quota
# on their media; hence, we should not raise the "over quota" flag
# checking the default storage only. If their usage exceeds their
# total quota on all their media, ObjectStore raises appropriate
# exception(s).
if state == JOB_READY and self.app.config.enable_quotas and \
(job.user is not None and
(job.user.active_storage_media is None or not job.user.has_active_storage_media())):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be redone on top of #10221 - which I think abstracts out the quota checking into nice optimizable functions. I think rather than checking if has_active_storage_media we should build on the abstractions in that PR to just ask if the configured objectstore we're talking to has quota left and then we can disable quota on objectstores that use storage media. The ability to disable quota on an objectstore is included in that PR.

quota = self.app.quota_agent.get_quota(job.user)
if quota is not None:
try:
Expand Down
8 changes: 7 additions & 1 deletion lib/galaxy/managers/hdas.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,13 @@ def purge(self, hda, flush=True):
super(HDAManager, self).purge(hda, flush=flush)
# decrease the user's space used
if quota_amount_reduction:
user.adjust_total_disk_usage(-quota_amount_reduction)
if not hda.dataset.has_active_storage_media():
user.adjust_total_disk_usage(-quota_amount_reduction)
else:
for assoc in hda.dataset.active_storage_media_associations:
VJalili marked this conversation as resolved.
Show resolved Hide resolved
assoc.storage_media.add_usage(-quota_amount_reduction)
if flush:
self.session().flush()
return hda

# .... states
Expand Down
161 changes: 161 additions & 0 deletions lib/galaxy/managers/storage_media.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""
Manager and Serializer for storage media.
"""

import logging

from galaxy import exceptions
from galaxy import model
from galaxy.managers import (
base,
datasets,
deletable,
hdas,
sharable
)

log = logging.getLogger(__name__)


class StorageMediaManager(base.ModelManager, deletable.PurgableManagerMixin):

model_class = model.StorageMedia
foreign_key_name = "storage_media"

def __init__(self, app, *args, **kwargs):
super(StorageMediaManager, self).__init__(app, *args, **kwargs)
self.hda_manager = hdas.HDAManager(app)
self.dataset_manager = datasets.DatasetManager(app)

def delete(self, storage_media, **kwargs):
"""
Deletes the given storage media by taking the following steps:
(1) marks the storage media `deleted` in the database (i.e., setting
the `deleted` attribute to True);
(2) marks `deleted` all the datasets persisted on the storage media;
(3) marks `deleted` all the StorageMedia-Dataset associations.
:param storage_media: The storage media to be deleted.
:type storage_media: galaxy.model.StorageMedia
:return: returns the deleted storage media.
"""
super(StorageMediaManager, self).delete(storage_media, kwargs)
for assoc in storage_media.data_association:
self.hda_manager.delete(assoc, kwargs)
self.dataset_manager.delete(assoc.dataset, kwargs)
super(StorageMediaManager, self).delete(assoc, kwargs)
self.session().flush()
return storage_media

def undelete(self, storage_media, **kwargs):
"""
Un-deletes the given storage media by taking the following steps:
(1) marks the storage media `un-deleted` in the database (i.e., setting
the `deleted` attribute to False);
(2) marks `un-deleted` all the datasets persisted on the storage media;
(3) marks `un-deleted` all the StorageMedia-Dataset associations.
:param storage_media: The storage media to be deleted.
:type storage_media: galaxy.model.StorageMedia
:return: returns the deleted storage media.
"""
super(StorageMediaManager, self).undelete(storage_media, kwargs)
for assoc in storage_media.data_association:
self.hda_manager.delete(assoc, kwargs)
self.dataset_manager.delete(assoc.dataset, kwargs)
super(StorageMediaManager, self).undelete(assoc, kwargs)
self.session().flush()
return storage_media

def purge(self, storage_media, **kwargs):
"""
Purges a storage media by taking the following steps:
(1) marks the storage media `purged` in the database;
(2) deletes all the datasets persisted on the storage media;
(3) marks all the HDAs associated with the deleted datasets as purged.
This operation does NOT `delete` the storage media physically
(e.g., it does not delete a S3 bucket), because the storage media
(e.g., a S3 bucket) may contain data other than those loaded
or mounted on Galaxy which deleting the media (e.g., deleting
a S3 bucket) will result in unexpected file deletes.
:param storage_media: The media to be purged.
:type: storage_media: galaxy.model.StorageMedia
:return: returns the purged storage media.
"""
if not storage_media.is_purgeable():
raise exceptions.ConfigDoesNotAllowException(
"The storage media (ID: `{}`; category: `{}`) is not purgeable; because {}".format(
storage_media.id, storage_media.category,
"it`s purgeable attribute is set to `False`." if storage_media.purgeable is False
else "it contains at least one dataset which is not purgeable."))
for i, assoc in enumerate(storage_media.data_association):
for hda in assoc.dataset.history_associations:
self.hda_manager.purge(hda)
self.dataset_manager.purge(assoc.dataset, storage_media=storage_media)
storage_media.data_association[i].purged = True
storage_media.purged = True
self.session().flush()
return storage_media


class StorageMediaSerializer(base.ModelSerializer, deletable.PurgableSerializerMixin):
"""
Interface/service object for serializing storage media into dictionaries.
"""
model_manager_class = StorageMediaManager

def __init__(self, app, **kwargs):
super(StorageMediaSerializer, self).__init__(app, **kwargs)
self.storage_media_manager = self.manager

self.default_view = "summary"
self.add_view("summary", [
"id",
"model_class",
"user_id",
"usage",
"category",
"path"
])
self.add_view("detailed", [
"id",
"model_class",
"user_id",
"create_time",
"update_time",
"usage",
"category",
"path",
"deleted",
"purged",
"purgeable"
])

def add_serializers(self):
super(StorageMediaSerializer, self).add_serializers()
deletable.PurgableSerializerMixin.add_serializers(self)

# Arguments of the following lambda functions:
# i : an instance of galaxy.model.StorageMedia.
# k : serialized dictionary key (e.g., "model_class", "category", and "path").
# **c: a dictionary containing "trans" and "user" objects.
self.serializers.update({
"id" : lambda i, k, **c: self.app.security.encode_id(i.id),
"model_class": lambda *a, **c: "StorageMedia",
"user_id" : lambda i, k, **c: self.app.security.encode_id(i.user_id),
"usage" : lambda i, k, **c: str(i.usage),
"category" : lambda i, k, **c: i.category,
"path" : lambda i, k, **c: i.path,
"deleted" : lambda i, k, **c: i.deleted,
"purged" : lambda i, k, **c: i.purged,
"purgeable" : lambda i, k, **c: i.purgeable
})


class StorageMediaDeserializer(sharable.SharableModelDeserializer, deletable.PurgableDeserializerMixin):

model_manager_class = StorageMediaManager

def add_deserializers(self):
super(StorageMediaDeserializer, self).add_deserializers()
self.deserializers.update({
"path": self.default_deserializer
})
Loading