Skip to content

Commit

Permalink
Plumbing for dereferencing URLs into datasets.
Browse files Browse the repository at this point in the history
  • Loading branch information
jmchilton committed Sep 26, 2024
1 parent fbad7c7 commit 9b7e3cb
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 4 deletions.
15 changes: 15 additions & 0 deletions lib/galaxy/managers/hdas.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,15 @@
taggable,
users,
)
from galaxy.managers.context import ProvidesHistoryContext
from galaxy.model import (
Job,
JobStateHistory,
JobToOutputDatasetAssociation,
)
from galaxy.model.base import transaction
from galaxy.model.deferred import materializer_factory
from galaxy.model.dereference import dereference_to_model
from galaxy.schema.schema import DatasetSourceType
from galaxy.schema.storage_cleaner import (
CleanableItemsSummary,
Expand All @@ -68,6 +70,7 @@
MinimalManagerApp,
StructuredApp,
)
from galaxy.tool_util.parameters import DataRequestUri
from galaxy.util.compression_utils import get_fileobj

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -343,6 +346,18 @@ def _set_permissions(self, trans, hda, role_ids_dict):
raise exceptions.RequestParameterInvalidException(error)


def dereference_input(
    trans: ProvidesHistoryContext, data_request: DataRequestUri, history: Optional[model.History] = None
) -> model.HistoryDatasetAssociation:
    """Turn a URI data request into a committed HDA.

    The HDA is built by ``dereference_to_model`` in ``history`` when one is
    supplied (and truthy), otherwise in ``trans.history``. The default
    permissions of that history are then applied to the new dataset and the
    session is committed.

    :param trans: context supplying the session, user, app and fallback history.
    :param data_request: description of the remote data to dereference.
    :param history: optional explicit target history.
    :returns: the newly created dataset association.
    """
    destination = history if history else trans.history
    session = trans.sa_session
    new_hda = dereference_to_model(session, trans.user, destination, data_request)

    security_agent = trans.app.security_agent
    default_permissions = security_agent.history_get_default_permissions(destination)
    # flush=False: the explicit commit below is what persists the changes.
    security_agent.set_all_dataset_permissions(new_hda.dataset, default_permissions, new=True, flush=False)

    with transaction(session):
        session.commit()
    return new_hda


class HDAStorageCleanerManager(base.StorageCleanerManager):
def __init__(self, hda_manager: HDAManager, dataset_manager: datasets.DatasetManager):
self.hda_manager = hda_manager
Expand Down
15 changes: 11 additions & 4 deletions lib/galaxy/model/deferred.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def ensure_materialized(
self,
dataset_instance: Union[HistoryDatasetAssociation, LibraryDatasetDatasetAssociation],
target_history: Optional[History] = None,
in_place: bool = False,
) -> HistoryDatasetAssociation:
"""Create a new detached dataset instance from the supplied instance.
Expand Down Expand Up @@ -148,10 +149,16 @@ def ensure_materialized(
history = dataset_instance.history
except DetachedInstanceError:
history = None
materialized_dataset_instance = HistoryDatasetAssociation(
create_dataset=False, # is the default but lets make this really clear...
history=history,
)

materialized_dataset_instance: HistoryDatasetAssociation
if not in_place:
materialized_dataset_instance = HistoryDatasetAssociation(
create_dataset=False, # is the default but lets make this really clear...
history=history,
)
else:
assert isinstance(dataset_instance, HistoryDatasetAssociation)
materialized_dataset_instance = cast(HistoryDatasetAssociation, dataset_instance)
if attached:
sa_session = self._sa_session
if sa_session is None:
Expand Down
46 changes: 46 additions & 0 deletions lib/galaxy/model/dereference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os.path
from typing import List

from galaxy.model import (
DatasetSource,
DatasetSourceHash,
HistoryDatasetAssociation,
TransformAction,
)
from galaxy.tool_util.parameters import DataRequestUri


def dereference_to_model(sa_session, user, history, data_request_uri: DataRequestUri) -> HistoryDatasetAssociation:
    """Build a deferred HDA in ``history`` describing ``data_request_uri``.

    The HDA is created with a fresh dataset, placed in the ``DEFERRED`` state
    and given a single ``DatasetSource`` that records the request URL, any
    hashes supplied with the request and at most one transform action. Both
    the HDA and the source are added to ``sa_session``; nothing is flushed or
    committed here.

    :param sa_session: session the new objects are added to.
    :param user: accepted for interface compatibility; not used in this function.
    :param history: history that receives the new HDA.
    :param data_request_uri: request describing the URL and its metadata.
    :returns: the new ``HistoryDatasetAssociation``.
    """

    def _as_source_hash(requested_hash) -> DatasetSourceHash:
        # Copy one requested hash onto a new model object.
        source_hash = DatasetSourceHash()
        source_hash.hash_function = requested_hash.hash_function
        source_hash.hash_value = requested_hash.hash_value
        return source_hash

    # Fall back to the last path component of the URL when no name was given.
    display_name = data_request_uri.name or os.path.basename(data_request_uri.url)
    genome_build = data_request_uri.dbkey or "?"

    hda = HistoryDatasetAssociation(
        name=display_name,
        extension=data_request_uri.ext,
        dbkey=genome_build,  # TODO
        history=history,
        create_dataset=True,
        sa_session=sa_session,
    )
    hda.state = hda.states.DEFERRED

    source = DatasetSource()
    source.source_uri = data_request_uri.url
    source.hashes = [_as_source_hash(requested) for requested in data_request_uri.hashes or []]
    hda.dataset.sources = [source]

    # space_to_tab takes precedence; to_posix_lines is only consulted without it.
    actions: List[TransformAction]
    if data_request_uri.space_to_tab:
        actions = [{"action": "space_to_tab"}]
    elif data_request_uri.to_posix_lines:
        actions = [{"action": "to_posix_lines"}]
    else:
        actions = []
    if actions:
        source.transform = actions

    sa_session.add(hda)
    sa_session.add(source)
    history.add_dataset(hda, genome_build=genome_build, quota=False)
    return hda
57 changes: 57 additions & 0 deletions test/unit/data/test_dereference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from base64 import b64encode

from galaxy.model.dereference import dereference_to_model
from galaxy.tool_util.parameters import DataRequestUri
from .model.test_model_store import setup_fixture_context_with_history

# Base64 text for the three-number payload "1 2 3".
B64_FOR_1_2_3 = b64encode(b"1 2 3").decode("utf-8")
# A URI that goes through the gxfiles:// scheme.
TEST_URI = "gxfiles://test/1.bed"
# A URI that carries its content inline via the base64:// scheme.
TEST_BASE64_URI = "base64://" + B64_FOR_1_2_3


def test_dereference():
    """A request with only a URL and extension is named after the URL basename."""
    _app, sa_session, user, history = setup_fixture_context_with_history()
    request = DataRequestUri(url=TEST_URI, ext="bed")
    result = dereference_to_model(sa_session, user, history, request)
    assert result.name == "1.bed"
    first_source = result.dataset.sources[0]
    assert first_source.source_uri == TEST_URI
    assert result.ext == "bed"


def test_dereference_dbkey():
    """An explicit dbkey on the request is carried over to the HDA."""
    _app, sa_session, user, history = setup_fixture_context_with_history()
    request = DataRequestUri(url=TEST_URI, ext="bed", dbkey="hg19")
    result = dereference_to_model(sa_session, user, history, request)
    assert result.name == "1.bed"
    first_source = result.dataset.sources[0]
    assert first_source.source_uri == TEST_URI
    assert result.dbkey == "hg19"


def test_dereference_md5():
    """Hashes supplied with the request are recorded on the dataset source."""
    _app, sa_session, user, history = setup_fixture_context_with_history()
    md5 = "f2b33fb7b3d0eb95090a16060e6a24f9"
    payload = {
        "url": TEST_BASE64_URI,
        "name": "foobar.txt",
        "ext": "txt",
        "hashes": [{"hash_function": "MD5", "hash_value": md5}],
    }
    request = DataRequestUri.model_validate(payload)
    result = dereference_to_model(sa_session, user, history, request)
    # The explicit name wins over anything derived from the URL.
    assert result.name == "foobar.txt"
    assert result.dataset.sources[0].source_uri == TEST_BASE64_URI
    assert result.dataset.sources[0].hashes[0]
    recorded_hash = result.dataset.sources[0].hashes[0]
    assert recorded_hash.hash_function == "MD5"
    assert recorded_hash.hash_value == md5


def test_dereference_to_posix():
    """A transform flag on the request becomes a transform action on the source.

    Despite the test name, the flag exercised here is ``space_to_tab``.
    """
    _app, sa_session, user, history = setup_fixture_context_with_history()
    payload = {"url": TEST_BASE64_URI, "name": "foobar.txt", "ext": "txt", "space_to_tab": True}
    request = DataRequestUri.model_validate(payload)
    result = dereference_to_model(sa_session, user, history, request)
    assert result.name == "foobar.txt"
    assert result.dataset.sources[0].source_uri == TEST_BASE64_URI
    first_action = result.dataset.sources[0].transform[0]
    assert first_action["action"] == "space_to_tab"

0 comments on commit 9b7e3cb

Please sign in to comment.