Skip to content

Commit

Permalink
Implement paired_or_unpaired collections...
Browse files Browse the repository at this point in the history
  • Loading branch information
jmchilton committed Dec 20, 2024
1 parent f31845a commit 89b74fe
Show file tree
Hide file tree
Showing 20 changed files with 574 additions and 24 deletions.
81 changes: 80 additions & 1 deletion lib/galaxy/model/dataset_collections/matching.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from typing import Optional
from typing import (
Optional,
TYPE_CHECKING,
)

from galaxy import exceptions
from galaxy.util import bunch
Expand All @@ -7,9 +10,85 @@
leaf,
)

if TYPE_CHECKING:
from galaxy.model import DatasetCollectionElement


CANNOT_MATCH_ERROR_MESSAGE = "Cannot match collection types."


class CollectionAdapter:
    """Wrap model objects with extra context to create pseudo or ephemeral
    collections for tool processing code.

    Used across tool actions and tool evaluation.  Subclasses implement the
    subset of the dataset-collection interface that tool code consumes.
    """

    @property
    def dataset_action_tuples(self):
        # (action, role_id) pairs describing permissions on the wrapped dataset(s).
        raise NotImplementedError()

    @property
    def dataset_states_and_extensions_summary(self):
        # Tuple of (states, extensions) sets summarizing the wrapped dataset(s).
        raise NotImplementedError()

    @property
    def dataset_instances(self):
        # Flat list of dataset instances contained in this pseudo collection.
        raise NotImplementedError()

    @property
    def elements(self):
        # Element-like objects, mirroring DatasetCollection.elements.
        raise NotImplementedError()

    def adapter_json(self):
        """JSON kwds to recover state from the database after the job has been recorded."""
        raise NotImplementedError()


class DCECollectionAdapter(CollectionAdapter):
    """Adapt a DatasetCollectionElement so it can act as a collection."""

    _dce: "DatasetCollectionElement"

    def __init__(self, dataset_collection_element: "DatasetCollectionElement"):
        self._dce = dataset_collection_element

    @property
    def dataset_action_tuples(self):
        # Delegate to the permission actions recorded on the wrapped dataset.
        actions = self._dce.dataset_instance.dataset.actions
        return [(permission.action, permission.role_id) for permission in actions]

    @property
    def dataset_states_and_extensions_summary(self):
        # A single wrapped dataset yields singleton state/extension sets.
        hda = self._dce.dataset_instance
        return ({hda.dataset.state}, {hda.extension})

    @property
    def dataset_instances(self):
        return self._dce.dataset_instances

    @property
    def elements(self):
        # The adapter behaves as a collection containing exactly this element.
        return [self._dce]


class SingletonCollection(DCECollectionAdapter):
    """Allow a singleton list element to act as a paired_or_unpaired collection."""

    def adapter_json(self):
        # Minimal serialized state needed for recover_adapter() to rebuild this wrapper.
        return {"type": "singleton"}


def recover_adapter(wrapped_object, adapter_json):
    """Rebuild a CollectionAdapter around ``wrapped_object`` from serialized state.

    ``adapter_json`` is the dictionary previously produced by the adapter's
    ``adapter_json()`` method.
    """
    adapter_type = adapter_json["type"]
    if adapter_type != "singleton":
        raise Exception(f"Unknown collection adapter encountered {adapter_type}")
    return SingletonCollection(wrapped_object)


class CollectionsToMatch:
"""Structure representing a set of collections that need to be matched up
when running tools (possibly workflows in the future as well).
Expand Down
2 changes: 2 additions & 0 deletions lib/galaxy/model/dataset_collections/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
from .types import (
list,
paired,
paired_or_unpaired,
record,
)

# All dataset collection type plugins registered with the application.
PLUGIN_CLASSES = [
    list.ListDatasetCollectionType,
    paired.PairedDatasetCollectionType,
    record.RecordDatasetCollectionType,
    paired_or_unpaired.PairedOrUnpairedDatasetCollectionType,
]


Expand Down
9 changes: 6 additions & 3 deletions lib/galaxy/model/dataset_collections/structure.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
""" Module for reasoning about structure of and matching hierarchical collections of data.
"""

import logging
from typing import TYPE_CHECKING

log = logging.getLogger(__name__)
if TYPE_CHECKING:
from .type_description import CollectionTypeDescription


class Leaf:
Expand Down Expand Up @@ -190,7 +191,9 @@ def dict_map(func, input_dict):
return {k: func(v) for k, v in input_dict.items()}


def get_structure(dataset_collection_instance, collection_type_description, leaf_subcollection_type=None):
def get_structure(
dataset_collection_instance, collection_type_description: "CollectionTypeDescription", leaf_subcollection_type=None
):
if leaf_subcollection_type:
collection_type_description = collection_type_description.effective_collection_type_description(
leaf_subcollection_type
Expand Down
17 changes: 16 additions & 1 deletion lib/galaxy/model/dataset_collections/subcollections.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,33 @@
from galaxy import exceptions
from .matching import SingletonCollection


def split_dataset_collection_instance(dataset_collection_instance, collection_type):
    """Split the instance's underlying collection into elements of ``collection_type``."""
    collection = dataset_collection_instance.collection
    return _split_dataset_collection(collection, collection_type)


def _is_a_subcollection_type(this_collection_type: str, collection_type: str):
if collection_type == "single_datasets":
# can be a subcollection of anything effectively...
return True
if not this_collection_type.endswith(collection_type) or this_collection_type == collection_type:
return False
return True


def _split_dataset_collection(dataset_collection, collection_type):
this_collection_type = dataset_collection.collection_type
if not this_collection_type.endswith(collection_type) or this_collection_type == collection_type:

if not _is_a_subcollection_type(this_collection_type, collection_type):
raise exceptions.MessageException("Cannot split collection in desired fashion.")

split_elements = []
for element in dataset_collection.elements:
if collection_type == "single_datasets":
split_elements.append(SingletonCollection(element))
continue

child_collection = element.child_collection
if child_collection is None:
raise exceptions.MessageException("Cannot split collection in desired fashion.")
Expand Down
27 changes: 23 additions & 4 deletions lib/galaxy/model/dataset_collections/type_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def effective_collection_type(self, subcollection_type):

return self.collection_type[: -(len(subcollection_type) + 1)]

def has_subcollections_of_type(self, other_collection_type):
def has_subcollections_of_type(self, other_collection_type) -> bool:
"""Take in another type (either flat string or another
CollectionTypeDescription) and determine if this collection contains
subcollections matching that type.
Expand All @@ -65,18 +65,37 @@ def has_subcollections_of_type(self, other_collection_type):
if hasattr(other_collection_type, "collection_type"):
other_collection_type = other_collection_type.collection_type
collection_type = self.collection_type
return collection_type.endswith(other_collection_type) and collection_type != other_collection_type
if collection_type == other_collection_type:
return False
if collection_type.endswith(other_collection_type):
return True
if other_collection_type == "paired_or_unpaired":
# this can be thought of as a subcollection of anything except a pair
# since it would match a pair exactly
return collection_type != "paired"
if other_collection_type == "single_datasets":
# effectively any collection has unpaired subcollections
return True
return False

def is_subcollection_of_type(self, other_collection_type):
    """Return True if this type describes subcollections of ``other_collection_type``.

    Accepts either a flat collection-type string or another description object.
    """
    if hasattr(other_collection_type, "collection_type"):
        other_description = other_collection_type
    else:
        # Flat string: promote it to a full description via the factory.
        factory = self.collection_type_description_factory
        other_description = factory.for_collection_type(other_collection_type)
    return other_description.has_subcollections_of_type(self)

def can_match_type(self, other_collection_type):
def can_match_type(self, other_collection_type) -> bool:
    """Return True if collections of this type can be matched against ``other_collection_type``.

    Accepts either a flat collection-type string or another description
    object.  Types match when identical; additionally ``paired`` and
    ``paired_or_unpaired`` collections can match each other.
    """
    # NOTE: the rendered diff left the pre-change line
    # `return other_collection_type == collection_type` in place, which made
    # everything below unreachable; that stale line is removed here.
    if hasattr(other_collection_type, "collection_type"):
        other_collection_type = other_collection_type.collection_type
    collection_type = self.collection_type
    if other_collection_type == collection_type:
        return True
    elif other_collection_type == "paired" and collection_type == "paired_or_unpaired":
        return True
    elif other_collection_type == "paired_or_unpaired" and collection_type == "paired":
        return True

    # can we push this to the type registry somehow?
    return False

def subcollection_type_description(self):
if not self.__has_subcollections:
Expand Down
8 changes: 8 additions & 0 deletions lib/galaxy/model/dataset_collections/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,11 @@ def generate_elements(self, dataset_instances: dict, **kwds):
class BaseDatasetCollectionType(DatasetCollectionType):
    """Shared helpers for concrete dataset collection type plugins."""

    def _validation_failed(self, message):
        # Validation problems surface uniformly as attribute-invalid errors.
        raise exceptions.ObjectAttributeInvalidException(message)

    def _ensure_dataset_with_identifier(self, dataset_instances: dict, name: str):
        """Return the dataset registered under ``name`` or raise if it is absent."""
        found = dataset_instances.get(name)
        if found is None:
            message = f"An element with the identifier {name} is required to create this collection type"
            raise exceptions.ObjectAttributeInvalidException(message)
        return found
11 changes: 9 additions & 2 deletions lib/galaxy/model/dataset_collections/types/paired.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from galaxy.exceptions import RequestParameterInvalidException
from galaxy.model import (
DatasetCollectionElement,
HistoryDatasetAssociation,
Expand All @@ -16,13 +17,19 @@ class PairedDatasetCollectionType(BaseDatasetCollectionType):
collection_type = "paired"

def generate_elements(self, dataset_instances, **kwds):
if forward_dataset := dataset_instances.get(FORWARD_IDENTIFIER):
num_datasets = len(dataset_instances)
if num_datasets != 2:
raise RequestParameterInvalidException(
f"Incorrect number of datasets - 2 datasets exactly are required to create a single_or_paired collection"
)

if forward_dataset := self._ensure_dataset_with_identifier(dataset_instances, FORWARD_IDENTIFIER):
left_association = DatasetCollectionElement(
element=forward_dataset,
element_identifier=FORWARD_IDENTIFIER,
)
yield left_association
if reverse_dataset := dataset_instances.get(REVERSE_IDENTIFIER):
if reverse_dataset := self._ensure_dataset_with_identifier(dataset_instances, REVERSE_IDENTIFIER):
right_association = DatasetCollectionElement(
element=reverse_dataset,
element_identifier=REVERSE_IDENTIFIER,
Expand Down
46 changes: 46 additions & 0 deletions lib/galaxy/model/dataset_collections/types/paired_or_unpaired.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from galaxy.exceptions import RequestParameterInvalidException
from galaxy.model import (
DatasetCollectionElement,
HistoryDatasetAssociation,
)
from . import BaseDatasetCollectionType
from .paired import (
FORWARD_IDENTIFIER,
REVERSE_IDENTIFIER,
)

SINGLETON_IDENTIFIER = "unpaired"


class PairedOrUnpairedDatasetCollectionType(BaseDatasetCollectionType):
    """Collection type holding either one unpaired dataset or a forward/reverse pair."""

    collection_type = "paired_or_unpaired"

    def generate_elements(self, dataset_instances, **kwds):
        """Yield DatasetCollectionElements for the supplied datasets.

        ``dataset_instances`` must contain exactly one dataset (keyed
        ``unpaired``) or exactly two (keyed ``forward`` and ``reverse``);
        any other count raises RequestParameterInvalidException.
        """
        num_datasets = len(dataset_instances)
        if num_datasets > 2 or num_datasets < 1:
            # plain literal: the original used an f-string with no placeholders (F541)
            raise RequestParameterInvalidException(
                "Incorrect number of datasets - 1 or 2 datasets is required to create a paired_or_unpaired collection"
            )

        if num_datasets == 2:
            if forward_dataset := self._ensure_dataset_with_identifier(dataset_instances, FORWARD_IDENTIFIER):
                yield DatasetCollectionElement(
                    element=forward_dataset,
                    element_identifier=FORWARD_IDENTIFIER,
                )
            if reverse_dataset := self._ensure_dataset_with_identifier(dataset_instances, REVERSE_IDENTIFIER):
                yield DatasetCollectionElement(
                    element=reverse_dataset,
                    element_identifier=REVERSE_IDENTIFIER,
                )
        else:
            # renamed from single_datasets: exactly one dataset is involved here
            if unpaired_dataset := self._ensure_dataset_with_identifier(dataset_instances, SINGLETON_IDENTIFIER):
                yield DatasetCollectionElement(
                    element=unpaired_dataset,
                    element_identifier=SINGLETON_IDENTIFIER,
                )
2 changes: 2 additions & 0 deletions lib/galaxy/schema/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
from typing_extensions import (
Annotated,
Literal,
NotRequired,
TypedDict,
)

from galaxy.schema import partial_model
Expand Down
57 changes: 57 additions & 0 deletions lib/galaxy/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3534,6 +3534,63 @@ def produce_outputs(self, trans, out_data, output_collections, incoming, history
)


class SplitPairedAndUnpairedTool(DatabaseOperationTool):
    """Database operation tool that splits a list-like collection into its
    paired and unpaired parts, producing two output collections."""

    tool_type = "split_paired_and_unpaired"
    # Operate on inputs even when their datasets are not yet terminal/ok.
    require_terminal_states = False
    require_dataset_ok = False

    def produce_outputs(self, trans, out_data, output_collections, incoming, history, **kwds):
        # "input" may be a DatasetCollectionElement (when mapping over
        # subcollections) or an HDCA — presumably distinguished by the
        # element_type attribute; TODO confirm against caller conventions.
        has_collection = incoming["input"]
        if hasattr(has_collection, "element_type"):
            # It is a DCE
            collection = has_collection.element_object
        else:
            # It is an HDCA
            collection = has_collection.collection

        collection_type = collection.collection_type
        assert collection_type in ["list", "list:paired", "list:paired_or_unpaired"]

        # element_identifier -> copied dataset/collection for each output,
        # plus a flat list of the datasets inside copied pairs.
        unpaired_dce_copies = {}
        paired_dce_copies = {}
        paired_datasets = []

        def _handle_unpaired(dce):
            # Copy a flat dataset element into the unpaired output, carrying its tags.
            element_identifier = dce.element_identifier
            assert getattr(dce.element_object, "history_content_type", None) == "dataset"
            copied_value = dce.element_object.copy(copy_tags=dce.element_object.tags, flush=False)
            unpaired_dce_copies[element_identifier] = copied_value

        def _handle_paired(dce):
            # Copy a nested pair collection and track its two member datasets
            # so they can be attached to the history below.
            element_identifier = dce.element_identifier
            copied_value = dce.element_object.copy(flush=False)
            paired_dce_copies[element_identifier] = copied_value
            paired_datasets.append(copied_value.elements[0].element_object)
            paired_datasets.append(copied_value.elements[1].element_object)

        if collection_type == "list":
            # Flat list: every element is unpaired.
            for element in collection.elements:
                _handle_unpaired(element)
        elif collection_type == "list:paired":
            # Uniform list of pairs: every element is paired.
            for element in collection.elements:
                _handle_paired(element)
        elif collection_type == "list:paired_or_unpaired":
            # Mixed list: dataset elements are unpaired, nested collections are pairs.
            for element in collection.elements:
                if getattr(element.element_object, "history_content_type", None) == "dataset":
                    _handle_unpaired(element)
                else:
                    _handle_paired(element)

        self._add_datasets_to_history(history, unpaired_dce_copies.values())
        self._add_datasets_to_history(history, paired_datasets)
        output_collections.create_collection(
            self.outputs["output_unpaired"], "output_unpaired", elements=unpaired_dce_copies, propagate_hda_tags=False
        )
        output_collections.create_collection(
            self.outputs["output_paired"], "output_paired", elements=paired_dce_copies, propagate_hda_tags=False
        )


class ExtractDatasetCollectionTool(DatabaseOperationTool):
tool_type = "extract_dataset"
require_terminal_states = False
Expand Down
Loading

0 comments on commit 89b74fe

Please sign in to comment.