Skip to content

Commit

Permalink
Implement paired_or_unpaired collections...
Browse files Browse the repository at this point in the history
  • Loading branch information
jmchilton committed Dec 21, 2024
1 parent 8101d5b commit d783f92
Show file tree
Hide file tree
Showing 22 changed files with 744 additions and 27 deletions.
2 changes: 2 additions & 0 deletions lib/galaxy/model/dataset_collections/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
from .types import (
list,
paired,
paired_or_unpaired,
record,
)

# Dataset collection type plugin classes registered with the collections registry.
PLUGIN_CLASSES = [
    list.ListDatasetCollectionType,
    paired.PairedDatasetCollectionType,
    record.RecordDatasetCollectionType,
    paired_or_unpaired.PairedOrUnpairedDatasetCollectionType,
]


Expand Down
11 changes: 7 additions & 4 deletions lib/galaxy/model/dataset_collections/structure.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
""" Module for reasoning about structure of and matching hierarchical collections of data.
"""

import logging
from typing import TYPE_CHECKING

log = logging.getLogger(__name__)
if TYPE_CHECKING:
from .type_description import CollectionTypeDescription


class Leaf:
Expand Down Expand Up @@ -149,7 +150,7 @@ def clone(self):
return Tree(cloned_children, self.collection_type_description)

def __str__(self):
return f"Tree[collection_type={self.collection_type_description},children={','.join(f'{identifier_and_element[0]}={identifier_and_element[1]}' for identifier_and_element in self.children)}]"
return f"Tree[collection_type={self.collection_type_description},children=({','.join(f'{identifier_and_element[0]}={identifier_and_element[1]}' for identifier_and_element in self.children)})]"


def tool_output_to_structure(get_sliced_input_collection_structure, tool_output, collections_manager):
Expand Down Expand Up @@ -190,7 +191,9 @@ def dict_map(func, input_dict):
return {k: func(v) for k, v in input_dict.items()}


def get_structure(dataset_collection_instance, collection_type_description, leaf_subcollection_type=None):
def get_structure(
dataset_collection_instance, collection_type_description: "CollectionTypeDescription", leaf_subcollection_type=None
):
if leaf_subcollection_type:
collection_type_description = collection_type_description.effective_collection_type_description(
leaf_subcollection_type
Expand Down
17 changes: 16 additions & 1 deletion lib/galaxy/model/dataset_collections/subcollections.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,33 @@
from galaxy import exceptions
from .adapters import PromoteCollectionElementToCollectionAdapter


def split_dataset_collection_instance(dataset_collection_instance, collection_type):
    """Split the instance's underlying collection into subcollections of *collection_type*."""
    collection = dataset_collection_instance.collection
    return _split_dataset_collection(collection, collection_type)


def _is_a_subcollection_type(this_collection_type: str, collection_type: str) -> bool:
    """Return True if *collection_type* names a subcollection level of *this_collection_type*."""
    # "single_datasets" can be a subcollection of anything effectively...
    if collection_type == "single_datasets":
        return True
    # A proper (non-identical) suffix match indicates a subcollection level.
    return this_collection_type != collection_type and this_collection_type.endswith(collection_type)


def _split_dataset_collection(dataset_collection, collection_type):
this_collection_type = dataset_collection.collection_type
if not this_collection_type.endswith(collection_type) or this_collection_type == collection_type:
is_this_collection_nested = ":" in this_collection_type
if not _is_a_subcollection_type(this_collection_type, collection_type):
raise exceptions.MessageException("Cannot split collection in desired fashion.")

split_elements = []
for element in dataset_collection.elements:
if not is_this_collection_nested and collection_type == "single_datasets":
split_elements.append(PromoteCollectionElementToCollectionAdapter(element))
continue

child_collection = element.child_collection
if child_collection is None:
raise exceptions.MessageException("Cannot split collection in desired fashion.")
Expand Down
30 changes: 26 additions & 4 deletions lib/galaxy/model/dataset_collections/type_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,12 @@ def effective_collection_type(self, subcollection_type):
if not self.has_subcollections_of_type(subcollection_type):
raise ValueError(f"Cannot compute effective subcollection type of {subcollection_type} over {self}")

if subcollection_type == "single_datasets":
return self.collection_type

return self.collection_type[: -(len(subcollection_type) + 1)]

def has_subcollections_of_type(self, other_collection_type):
def has_subcollections_of_type(self, other_collection_type) -> bool:
"""Take in another type (either flat string or another
CollectionTypeDescription) and determine if this collection contains
subcollections matching that type.
Expand All @@ -65,18 +68,37 @@ def has_subcollections_of_type(self, other_collection_type):
if hasattr(other_collection_type, "collection_type"):
other_collection_type = other_collection_type.collection_type
collection_type = self.collection_type
return collection_type.endswith(other_collection_type) and collection_type != other_collection_type
if collection_type == other_collection_type:
return False
if collection_type.endswith(other_collection_type):
return True
if other_collection_type == "paired_or_unpaired":
# this can be thought of as a subcollection of anything except a pair
# since it would match a pair exactly
return collection_type != "paired"
if other_collection_type == "single_datasets":
# effectively any collection has unpaired subcollections
return True
return False

def is_subcollection_of_type(self, other_collection_type):
if not hasattr(other_collection_type, "collection_type"):
other_collection_type = self.collection_type_description_factory.for_collection_type(other_collection_type)
return other_collection_type.has_subcollections_of_type(self)

def can_match_type(self, other_collection_type):
def can_match_type(self, other_collection_type) -> bool:
if hasattr(other_collection_type, "collection_type"):
other_collection_type = other_collection_type.collection_type
collection_type = self.collection_type
return other_collection_type == collection_type
if other_collection_type == collection_type:
return True
elif other_collection_type == "paired" and collection_type == "paired_or_unpaired":
return True
elif other_collection_type == "paired_or_unpaired" and collection_type == "paired":
return True

# can we push this to the type registry somehow?
return False

def subcollection_type_description(self):
if not self.__has_subcollections:
Expand Down
8 changes: 8 additions & 0 deletions lib/galaxy/model/dataset_collections/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,11 @@ def generate_elements(self, dataset_instances: dict, **kwds):
class BaseDatasetCollectionType(DatasetCollectionType):
def _validation_failed(self, message):
raise exceptions.ObjectAttributeInvalidException(message)

def _ensure_dataset_with_identifier(self, dataset_instances: dict, name: str):
dataset_instance = dataset_instances.get(name)
if dataset_instance is None:
raise exceptions.ObjectAttributeInvalidException(
f"An element with the identifier {name} is required to create this collection type"
)
return dataset_instance
11 changes: 9 additions & 2 deletions lib/galaxy/model/dataset_collections/types/paired.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from galaxy.exceptions import RequestParameterInvalidException
from galaxy.model import (
DatasetCollectionElement,
HistoryDatasetAssociation,
Expand All @@ -16,13 +17,19 @@ class PairedDatasetCollectionType(BaseDatasetCollectionType):
collection_type = "paired"

def generate_elements(self, dataset_instances, **kwds):
if forward_dataset := dataset_instances.get(FORWARD_IDENTIFIER):
num_datasets = len(dataset_instances)
if num_datasets != 2:
raise RequestParameterInvalidException(
f"Incorrect number of datasets - 2 datasets exactly are required to create a single_or_paired collection"
)

if forward_dataset := self._ensure_dataset_with_identifier(dataset_instances, FORWARD_IDENTIFIER):
left_association = DatasetCollectionElement(
element=forward_dataset,
element_identifier=FORWARD_IDENTIFIER,
)
yield left_association
if reverse_dataset := dataset_instances.get(REVERSE_IDENTIFIER):
if reverse_dataset := self._ensure_dataset_with_identifier(dataset_instances, REVERSE_IDENTIFIER):
right_association = DatasetCollectionElement(
element=reverse_dataset,
element_identifier=REVERSE_IDENTIFIER,
Expand Down
46 changes: 46 additions & 0 deletions lib/galaxy/model/dataset_collections/types/paired_or_unpaired.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from galaxy.exceptions import RequestParameterInvalidException
from galaxy.model import (
DatasetCollectionElement,
HistoryDatasetAssociation,
)
from . import BaseDatasetCollectionType
from .paired import (
FORWARD_IDENTIFIER,
REVERSE_IDENTIFIER,
)

# Element identifier used when the collection holds a single, unpaired dataset.
SINGLETON_IDENTIFIER = "unpaired"


class PairedOrUnpairedDatasetCollectionType(BaseDatasetCollectionType):
    """Collection that is either a forward/reverse pair or a single unpaired dataset.

    With two datasets, elements use the standard "forward"/"reverse"
    identifiers; with one dataset, the sole element uses the "unpaired"
    identifier.
    """

    collection_type = "paired_or_unpaired"

    def generate_elements(self, dataset_instances, **kwds):
        """Yield DatasetCollectionElement objects for the supplied datasets.

        :param dataset_instances: dict mapping element identifiers to dataset instances.
        :raises RequestParameterInvalidException: unless exactly 1 or 2 datasets
            are supplied.
        """
        num_datasets = len(dataset_instances)
        if num_datasets > 2 or num_datasets < 1:
            raise RequestParameterInvalidException(
                "Incorrect number of datasets - 1 or 2 datasets is required to create a paired_or_unpaired collection"
            )

        if num_datasets == 2:
            # Paired layout: both canonical identifiers must be present.
            if forward_dataset := self._ensure_dataset_with_identifier(dataset_instances, FORWARD_IDENTIFIER):
                yield DatasetCollectionElement(
                    element=forward_dataset,
                    element_identifier=FORWARD_IDENTIFIER,
                )
            if reverse_dataset := self._ensure_dataset_with_identifier(dataset_instances, REVERSE_IDENTIFIER):
                yield DatasetCollectionElement(
                    element=reverse_dataset,
                    element_identifier=REVERSE_IDENTIFIER,
                )
        else:
            # Exactly one dataset: it must carry the "unpaired" identifier.
            if unpaired_dataset := self._ensure_dataset_with_identifier(dataset_instances, SINGLETON_IDENTIFIER):
                yield DatasetCollectionElement(
                    element=unpaired_dataset,
                    element_identifier=SINGLETON_IDENTIFIER,
                )
2 changes: 2 additions & 0 deletions lib/galaxy/schema/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
from typing_extensions import (
Annotated,
Literal,
NotRequired,
TypedDict,
)

from galaxy.schema import partial_model
Expand Down
74 changes: 74 additions & 0 deletions lib/galaxy/tool_util/parameters/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
StrictInt,
StrictStr,
Tag,
TypeAdapter,
ValidationError,
)
from typing_extensions import (
Expand Down Expand Up @@ -329,6 +330,7 @@ def request_requires_value(self) -> bool:

# Literal "src" values accepted for dataset references.
DataSrcT = Literal["hda", "ldda"]
# Literal "src" values accepted where collections ("hdca") are also allowed.
MultiDataSrcT = Literal["hda", "ldda", "hdca"]
# TODO(jmchilton): rename CollectionStrT -> CollectionSrcT ("Str" is a typo for "Src").
CollectionStrT = Literal["hdca"]

TestCaseDataSrcT = Literal["File"]
Expand Down Expand Up @@ -527,6 +529,78 @@ class DataCollectionRequestInternal(StrictModel):
id: StrictInt


# Discriminator literal marking a request as a collection adapter rather than a
# materialized collection.
CollectionAdapterSrcT = Literal["CollectionAdapter"]


class AdaptedDataCollectionRequestBase(StrictModel):
    """Base model for requests that adapt datasets into collection-shaped inputs."""

    src: CollectionAdapterSrcT


class AdaptedDataCollectionPromoteDatasetToCollectionRequest(AdaptedDataCollectionRequestBase):
    """Adapt a single HDA reference so it can serve as a collection input."""

    adapter_type: Literal["PromoteDatasetToCollection"]
    collection_type: Literal["list", "paired_or_unpaired"]
    adapting: DataRequestHda


# calling this name and element_identifier to align with fetch API, etc...
class AdapterElementRequest(DataRequestHda):
    """An HDA reference plus the element identifier it assumes in the adapted collection."""

    name: str  # element_identifier


class AdaptedDataCollectionPromoteDatasetsToCollectionRequest(AdaptedDataCollectionRequestBase):
    """Adapt several identified HDA references into a paired-style collection input."""

    adapter_type: Literal["PromoteDatasetsToCollection"]
    # could allow list in here without changing much else I think but I'm trying to keep these tight in scope
    collection_type: Literal["paired", "paired_or_unpaired"]
    adapting: List[AdapterElementRequest]


# Union of external adapter requests, discriminated by adapter_type.
AdaptedDataCollectionRequest = Annotated[
    Union[
        AdaptedDataCollectionPromoteDatasetToCollectionRequest, AdaptedDataCollectionPromoteDatasetsToCollectionRequest
    ],
    Field(discriminator="adapter_type"),
]
AdaptedDataCollectionRequestTypeAdapter = TypeAdapter(AdaptedDataCollectionRequest)


class DatasetCollectionElementReference(StrictModel):
    """Reference an existing dataset collection element ("dce") by integer id."""

    src: Literal["dce"]
    id: StrictInt


class AdaptedDataCollectionPromoteCollectionElementToCollectionRequestInternal(AdaptedDataCollectionRequestBase):
    """Internal adapter request promoting a collection element to a collection input."""

    adapter_type: Literal["PromoteCollectionElementToCollection"]
    adapting: DatasetCollectionElementReference


class AdaptedDataCollectionPromoteDatasetToCollectionRequestInternal(AdaptedDataCollectionRequestBase):
    """Internal variant of AdaptedDataCollectionPromoteDatasetToCollectionRequest."""

    adapter_type: Literal["PromoteDatasetToCollection"]
    collection_type: Literal["list", "paired_or_unpaired"]
    adapting: DataRequestInternalHda


class AdapterElementRequestInternal(DataRequestInternalHda):
    """Internal variant of AdapterElementRequest."""

    name: str  # element_identifier


class AdaptedDataCollectionPromoteDatasetsToCollectionRequestInternal(AdaptedDataCollectionRequestBase):
    """Internal variant of AdaptedDataCollectionPromoteDatasetsToCollectionRequest."""

    adapter_type: Literal["PromoteDatasetsToCollection"]
    # could allow list in here without changing much else I think but I'm trying to keep these tight in scope
    collection_type: Literal["paired", "paired_or_unpaired"]
    adapting: List[AdapterElementRequestInternal]


# Union of internal adapter requests, discriminated by adapter_type.
AdaptedDataCollectionRequestInternal = Annotated[
    Union[
        AdaptedDataCollectionPromoteCollectionElementToCollectionRequestInternal,
        AdaptedDataCollectionPromoteDatasetToCollectionRequestInternal,
        AdaptedDataCollectionPromoteDatasetsToCollectionRequestInternal,
    ],
    Field(discriminator="adapter_type"),
]
AdaptedDataCollectionRequestInternalTypeAdapter = TypeAdapter(AdaptedDataCollectionRequestInternal)


class DataCollectionParameterModel(BaseGalaxyToolParameterModelDefinition):
parameter_type: Literal["gx_data_collection"] = "gx_data_collection"
collection_type: Optional[str] = None
Expand Down
57 changes: 57 additions & 0 deletions lib/galaxy/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3534,6 +3534,63 @@ def produce_outputs(self, trans, out_data, output_collections, incoming, history
)


class SplitPairedAndUnpairedTool(DatabaseOperationTool):
    """Split a list-like collection into its paired and unpaired elements.

    Produces two output collections: "output_unpaired" (flat dataset
    elements) and "output_paired" (pair elements).
    """

    tool_type = "split_paired_and_unpaired"
    # Inputs need not be in a terminal/ok state for this operation.
    require_terminal_states = False
    require_dataset_ok = False

    def produce_outputs(self, trans, out_data, output_collections, incoming, history, **kwds):
        """Copy elements of the input collection into the two output collections.

        Supports input collection types "list", "list:paired" and
        "list:paired_or_unpaired"; anything else fails the assertion below.
        """
        has_collection = incoming["input"]
        if hasattr(has_collection, "element_type"):
            # It is a DCE
            collection = has_collection.element_object
        else:
            # It is an HDCA
            collection = has_collection.collection

        collection_type = collection.collection_type
        assert collection_type in ["list", "list:paired", "list:paired_or_unpaired"]

        # element_identifier -> copied element value, in input order
        unpaired_dce_copies = {}
        paired_dce_copies = {}
        # flat datasets nested inside copied pairs; collected so they can be
        # added to the history alongside the collections
        paired_datasets = []

        def _handle_unpaired(dce):
            # Copy a flat dataset element into the unpaired output.
            element_identifier = dce.element_identifier
            assert getattr(dce.element_object, "history_content_type", None) == "dataset"
            copied_value = dce.element_object.copy(copy_tags=dce.element_object.tags, flush=False)
            unpaired_dce_copies[element_identifier] = copied_value

        def _handle_paired(dce):
            # Copy a pair element (a child collection) into the paired output.
            element_identifier = dce.element_identifier
            copied_value = dce.element_object.copy(flush=False)
            paired_dce_copies[element_identifier] = copied_value
            paired_datasets.append(copied_value.elements[0].element_object)
            paired_datasets.append(copied_value.elements[1].element_object)

        if collection_type == "list":
            for element in collection.elements:
                _handle_unpaired(element)
        elif collection_type == "list:paired":
            for element in collection.elements:
                _handle_paired(element)
        elif collection_type == "list:paired_or_unpaired":
            # Mixed case: decide per element whether it is a flat dataset or a pair.
            for element in collection.elements:
                if getattr(element.element_object, "history_content_type", None) == "dataset":
                    _handle_unpaired(element)
                else:
                    _handle_paired(element)

        self._add_datasets_to_history(history, unpaired_dce_copies.values())
        self._add_datasets_to_history(history, paired_datasets)
        output_collections.create_collection(
            self.outputs["output_unpaired"], "output_unpaired", elements=unpaired_dce_copies, propagate_hda_tags=False
        )
        output_collections.create_collection(
            self.outputs["output_paired"], "output_paired", elements=paired_dce_copies, propagate_hda_tags=False
        )


class ExtractDatasetCollectionTool(DatabaseOperationTool):
tool_type = "extract_dataset"
require_terminal_states = False
Expand Down
Loading

0 comments on commit d783f92

Please sign in to comment.