Skip to content

Commit

Permalink
Implement paired_or_unpaired collections...
Browse files Browse the repository at this point in the history
  • Loading branch information
jmchilton committed Dec 21, 2024
1 parent 8101d5b commit d783f92
Show file tree
Hide file tree
Showing 22 changed files with 744 additions and 27 deletions.
2 changes: 2 additions & 0 deletions lib/galaxy/model/dataset_collections/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
from .types import (
list,
paired,
paired_or_unpaired,
record,
)

# Dataset collection type plugin classes registered with the collections registry.
PLUGIN_CLASSES = [
    list.ListDatasetCollectionType,
    paired.PairedDatasetCollectionType,
    record.RecordDatasetCollectionType,
    paired_or_unpaired.PairedOrUnpairedDatasetCollectionType,
]


Expand Down
11 changes: 7 additions & 4 deletions lib/galaxy/model/dataset_collections/structure.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
""" Module for reasoning about structure of and matching hierarchical collections of data.
"""

import logging
from typing import TYPE_CHECKING

log = logging.getLogger(__name__)
if TYPE_CHECKING:
from .type_description import CollectionTypeDescription


class Leaf:
Expand Down Expand Up @@ -149,7 +150,7 @@ def clone(self):
return Tree(cloned_children, self.collection_type_description)

def __str__(self):
return f"Tree[collection_type={self.collection_type_description},children={','.join(f'{identifier_and_element[0]}={identifier_and_element[1]}' for identifier_and_element in self.children)}]"
return f"Tree[collection_type={self.collection_type_description},children=({','.join(f'{identifier_and_element[0]}={identifier_and_element[1]}' for identifier_and_element in self.children)})]"


def tool_output_to_structure(get_sliced_input_collection_structure, tool_output, collections_manager):
Expand Down Expand Up @@ -190,7 +191,9 @@ def dict_map(func, input_dict):
return {k: func(v) for k, v in input_dict.items()}


def get_structure(dataset_collection_instance, collection_type_description, leaf_subcollection_type=None):
def get_structure(
dataset_collection_instance, collection_type_description: "CollectionTypeDescription", leaf_subcollection_type=None
):
if leaf_subcollection_type:
collection_type_description = collection_type_description.effective_collection_type_description(
leaf_subcollection_type
Expand Down
17 changes: 16 additions & 1 deletion lib/galaxy/model/dataset_collections/subcollections.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,33 @@
from galaxy import exceptions
from .adapters import PromoteCollectionElementToCollectionAdapter


def split_dataset_collection_instance(dataset_collection_instance, collection_type):
    """Split the instance's underlying collection into subcollections of *collection_type*."""
    collection = dataset_collection_instance.collection
    return _split_dataset_collection(collection, collection_type)


def _is_a_subcollection_type(this_collection_type: str, collection_type: str) -> bool:
    """Return True if *collection_type* names a subcollection level of *this_collection_type*."""
    # "single_datasets" can be a subcollection of anything effectively...
    if collection_type == "single_datasets":
        return True
    # A proper (non-identical) suffix match indicates a subcollection level.
    return this_collection_type != collection_type and this_collection_type.endswith(collection_type)


def _split_dataset_collection(dataset_collection, collection_type):
this_collection_type = dataset_collection.collection_type
if not this_collection_type.endswith(collection_type) or this_collection_type == collection_type:
is_this_collection_nested = ":" in this_collection_type
if not _is_a_subcollection_type(this_collection_type, collection_type):
raise exceptions.MessageException("Cannot split collection in desired fashion.")

split_elements = []
for element in dataset_collection.elements:
if not is_this_collection_nested and collection_type == "single_datasets":
split_elements.append(PromoteCollectionElementToCollectionAdapter(element))
continue

child_collection = element.child_collection
if child_collection is None:
raise exceptions.MessageException("Cannot split collection in desired fashion.")
Expand Down
30 changes: 26 additions & 4 deletions lib/galaxy/model/dataset_collections/type_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,12 @@ def effective_collection_type(self, subcollection_type):
if not self.has_subcollections_of_type(subcollection_type):
raise ValueError(f"Cannot compute effective subcollection type of {subcollection_type} over {self}")

if subcollection_type == "single_datasets":
return self.collection_type

return self.collection_type[: -(len(subcollection_type) + 1)]

def has_subcollections_of_type(self, other_collection_type):
def has_subcollections_of_type(self, other_collection_type) -> bool:
"""Take in another type (either flat string or another
CollectionTypeDescription) and determine if this collection contains
subcollections matching that type.
Expand All @@ -65,18 +68,37 @@ def has_subcollections_of_type(self, other_collection_type):
if hasattr(other_collection_type, "collection_type"):
other_collection_type = other_collection_type.collection_type
collection_type = self.collection_type
return collection_type.endswith(other_collection_type) and collection_type != other_collection_type
if collection_type == other_collection_type:
return False
if collection_type.endswith(other_collection_type):
return True
if other_collection_type == "paired_or_unpaired":
# this can be thought of as a subcollection of anything except a pair
# since it would match a pair exactly
return collection_type != "paired"
if other_collection_type == "single_datasets":
# effectively any collection has unpaired subcollections
return True
return False

def is_subcollection_of_type(self, other_collection_type):
if not hasattr(other_collection_type, "collection_type"):
other_collection_type = self.collection_type_description_factory.for_collection_type(other_collection_type)
return other_collection_type.has_subcollections_of_type(self)

def can_match_type(self, other_collection_type):
def can_match_type(self, other_collection_type) -> bool:
if hasattr(other_collection_type, "collection_type"):
other_collection_type = other_collection_type.collection_type
collection_type = self.collection_type
return other_collection_type == collection_type
if other_collection_type == collection_type:
return True
elif other_collection_type == "paired" and collection_type == "paired_or_unpaired":
return True
elif other_collection_type == "paired_or_unpaired" and collection_type == "paired":
return True

# can we push this to the type registry somehow?
return False

def subcollection_type_description(self):
if not self.__has_subcollections:
Expand Down
8 changes: 8 additions & 0 deletions lib/galaxy/model/dataset_collections/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,11 @@ def generate_elements(self, dataset_instances: dict, **kwds):
class BaseDatasetCollectionType(DatasetCollectionType):
def _validation_failed(self, message):
raise exceptions.ObjectAttributeInvalidException(message)

def _ensure_dataset_with_identifier(self, dataset_instances: dict, name: str):
dataset_instance = dataset_instances.get(name)
if dataset_instance is None:
raise exceptions.ObjectAttributeInvalidException(
f"An element with the identifier {name} is required to create this collection type"
)
return dataset_instance
11 changes: 9 additions & 2 deletions lib/galaxy/model/dataset_collections/types/paired.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from galaxy.exceptions import RequestParameterInvalidException
from galaxy.model import (
DatasetCollectionElement,
HistoryDatasetAssociation,
Expand All @@ -16,13 +17,19 @@ class PairedDatasetCollectionType(BaseDatasetCollectionType):
collection_type = "paired"

def generate_elements(self, dataset_instances, **kwds):
if forward_dataset := dataset_instances.get(FORWARD_IDENTIFIER):
num_datasets = len(dataset_instances)
if num_datasets != 2:
raise RequestParameterInvalidException(
f"Incorrect number of datasets - 2 datasets exactly are required to create a single_or_paired collection"
)

if forward_dataset := self._ensure_dataset_with_identifier(dataset_instances, FORWARD_IDENTIFIER):
left_association = DatasetCollectionElement(
element=forward_dataset,
element_identifier=FORWARD_IDENTIFIER,
)
yield left_association
if reverse_dataset := dataset_instances.get(REVERSE_IDENTIFIER):
if reverse_dataset := self._ensure_dataset_with_identifier(dataset_instances, REVERSE_IDENTIFIER):
right_association = DatasetCollectionElement(
element=reverse_dataset,
element_identifier=REVERSE_IDENTIFIER,
Expand Down
46 changes: 46 additions & 0 deletions lib/galaxy/model/dataset_collections/types/paired_or_unpaired.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from galaxy.exceptions import RequestParameterInvalidException
from galaxy.model import (
DatasetCollectionElement,
HistoryDatasetAssociation,
)
from . import BaseDatasetCollectionType
from .paired import (
FORWARD_IDENTIFIER,
REVERSE_IDENTIFIER,
)

# Element identifier used when the collection holds a single, unpaired dataset.
SINGLETON_IDENTIFIER = "unpaired"


class PairedOrUnpairedDatasetCollectionType(BaseDatasetCollectionType):
    """Collection that is either a forward/reverse pair or a single unpaired dataset.

    With two datasets, elements use the standard "forward"/"reverse"
    identifiers; with one dataset, the sole element uses the "unpaired"
    identifier.
    """

    collection_type = "paired_or_unpaired"

    def generate_elements(self, dataset_instances, **kwds):
        """Yield DatasetCollectionElement objects for the supplied datasets.

        :param dataset_instances: dict mapping element identifiers to dataset instances.
        :raises RequestParameterInvalidException: unless exactly 1 or 2 datasets
            are supplied.
        """
        num_datasets = len(dataset_instances)
        if num_datasets > 2 or num_datasets < 1:
            raise RequestParameterInvalidException(
                "Incorrect number of datasets - 1 or 2 datasets is required to create a paired_or_unpaired collection"
            )

        if num_datasets == 2:
            # Paired layout: both canonical identifiers must be present.
            if forward_dataset := self._ensure_dataset_with_identifier(dataset_instances, FORWARD_IDENTIFIER):
                yield DatasetCollectionElement(
                    element=forward_dataset,
                    element_identifier=FORWARD_IDENTIFIER,
                )
            if reverse_dataset := self._ensure_dataset_with_identifier(dataset_instances, REVERSE_IDENTIFIER):
                yield DatasetCollectionElement(
                    element=reverse_dataset,
                    element_identifier=REVERSE_IDENTIFIER,
                )
        else:
            # Exactly one dataset: it must carry the "unpaired" identifier.
            if unpaired_dataset := self._ensure_dataset_with_identifier(dataset_instances, SINGLETON_IDENTIFIER):
                yield DatasetCollectionElement(
                    element=unpaired_dataset,
                    element_identifier=SINGLETON_IDENTIFIER,
                )
2 changes: 2 additions & 0 deletions lib/galaxy/schema/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
from typing_extensions import (
Annotated,
Literal,
NotRequired,
TypedDict,
)

from galaxy.schema import partial_model
Expand Down
74 changes: 74 additions & 0 deletions lib/galaxy/tool_util/parameters/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
StrictInt,
StrictStr,
Tag,
TypeAdapter,
ValidationError,
)
from typing_extensions import (
Expand Down Expand Up @@ -329,6 +330,7 @@ def request_requires_value(self) -> bool:

# Literal "src" values accepted for dataset references.
DataSrcT = Literal["hda", "ldda"]
# Literal "src" values accepted where collections ("hdca") are also allowed.
MultiDataSrcT = Literal["hda", "ldda", "hdca"]
# TODO(jmchilton): rename CollectionStrT -> CollectionSrcT ("Str" is a typo for "Src").
CollectionStrT = Literal["hdca"]

TestCaseDataSrcT = Literal["File"]
Expand Down Expand Up @@ -527,6 +529,78 @@ class DataCollectionRequestInternal(StrictModel):
id: StrictInt


# Discriminator literal marking a request as a collection adapter rather than a
# materialized collection.
CollectionAdapterSrcT = Literal["CollectionAdapter"]


class AdaptedDataCollectionRequestBase(StrictModel):
    """Base model for requests that adapt datasets into collection-shaped inputs."""

    src: CollectionAdapterSrcT


class AdaptedDataCollectionPromoteDatasetToCollectionRequest(AdaptedDataCollectionRequestBase):
    """Adapt a single HDA reference so it can serve as a collection input."""

    adapter_type: Literal["PromoteDatasetToCollection"]
    collection_type: Literal["list", "paired_or_unpaired"]
    adapting: DataRequestHda


# calling this name and element_identifier to align with fetch API, etc...
class AdapterElementRequest(DataRequestHda):
    """An HDA reference plus the element identifier it assumes in the adapted collection."""

    name: str  # element_identifier


class AdaptedDataCollectionPromoteDatasetsToCollectionRequest(AdaptedDataCollectionRequestBase):
    """Adapt several identified HDA references into a paired-style collection input."""

    adapter_type: Literal["PromoteDatasetsToCollection"]
    # could allow list in here without changing much else I think but I'm trying to keep these tight in scope
    collection_type: Literal["paired", "paired_or_unpaired"]
    adapting: List[AdapterElementRequest]


# Union of external adapter requests, discriminated by adapter_type.
AdaptedDataCollectionRequest = Annotated[
    Union[
        AdaptedDataCollectionPromoteDatasetToCollectionRequest, AdaptedDataCollectionPromoteDatasetsToCollectionRequest
    ],
    Field(discriminator="adapter_type"),
]
AdaptedDataCollectionRequestTypeAdapter = TypeAdapter(AdaptedDataCollectionRequest)


class DatasetCollectionElementReference(StrictModel):
    """Reference an existing dataset collection element ("dce") by integer id."""

    src: Literal["dce"]
    id: StrictInt


class AdaptedDataCollectionPromoteCollectionElementToCollectionRequestInternal(AdaptedDataCollectionRequestBase):
    """Internal adapter request promoting a collection element to a collection input."""

    adapter_type: Literal["PromoteCollectionElementToCollection"]
    adapting: DatasetCollectionElementReference


class AdaptedDataCollectionPromoteDatasetToCollectionRequestInternal(AdaptedDataCollectionRequestBase):
    """Internal variant of AdaptedDataCollectionPromoteDatasetToCollectionRequest."""

    adapter_type: Literal["PromoteDatasetToCollection"]
    collection_type: Literal["list", "paired_or_unpaired"]
    adapting: DataRequestInternalHda


class AdapterElementRequestInternal(DataRequestInternalHda):
    """Internal variant of AdapterElementRequest."""

    name: str  # element_identifier


class AdaptedDataCollectionPromoteDatasetsToCollectionRequestInternal(AdaptedDataCollectionRequestBase):
    """Internal variant of AdaptedDataCollectionPromoteDatasetsToCollectionRequest."""

    adapter_type: Literal["PromoteDatasetsToCollection"]
    # could allow list in here without changing much else I think but I'm trying to keep these tight in scope
    collection_type: Literal["paired", "paired_or_unpaired"]
    adapting: List[AdapterElementRequestInternal]


# Union of internal adapter requests, discriminated by adapter_type.
AdaptedDataCollectionRequestInternal = Annotated[
    Union[
        AdaptedDataCollectionPromoteCollectionElementToCollectionRequestInternal,
        AdaptedDataCollectionPromoteDatasetToCollectionRequestInternal,
        AdaptedDataCollectionPromoteDatasetsToCollectionRequestInternal,
    ],
    Field(discriminator="adapter_type"),
]
AdaptedDataCollectionRequestInternalTypeAdapter = TypeAdapter(AdaptedDataCollectionRequestInternal)


class DataCollectionParameterModel(BaseGalaxyToolParameterModelDefinition):
parameter_type: Literal["gx_data_collection"] = "gx_data_collection"
collection_type: Optional[str] = None
Expand Down
57 changes: 57 additions & 0 deletions lib/galaxy/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3534,6 +3534,63 @@ def produce_outputs(self, trans, out_data, output_collections, incoming, history
)


class SplitPairedAndUnpairedTool(DatabaseOperationTool):
    """Split a list-like collection into its paired and unpaired elements.

    Produces two output collections: "output_unpaired" (flat dataset
    elements) and "output_paired" (pair elements).
    """

    tool_type = "split_paired_and_unpaired"
    # Inputs need not be in a terminal/ok state for this operation.
    require_terminal_states = False
    require_dataset_ok = False

    def produce_outputs(self, trans, out_data, output_collections, incoming, history, **kwds):
        """Copy elements of the input collection into the two output collections.

        Supports input collection types "list", "list:paired" and
        "list:paired_or_unpaired"; anything else fails the assertion below.
        """
        has_collection = incoming["input"]
        if hasattr(has_collection, "element_type"):
            # It is a DCE
            collection = has_collection.element_object
        else:
            # It is an HDCA
            collection = has_collection.collection

        collection_type = collection.collection_type
        assert collection_type in ["list", "list:paired", "list:paired_or_unpaired"]

        # element_identifier -> copied element value, in input order
        unpaired_dce_copies = {}
        paired_dce_copies = {}
        # flat datasets nested inside copied pairs; collected so they can be
        # added to the history alongside the collections
        paired_datasets = []

        def _handle_unpaired(dce):
            # Copy a flat dataset element into the unpaired output.
            element_identifier = dce.element_identifier
            assert getattr(dce.element_object, "history_content_type", None) == "dataset"
            copied_value = dce.element_object.copy(copy_tags=dce.element_object.tags, flush=False)
            unpaired_dce_copies[element_identifier] = copied_value

        def _handle_paired(dce):
            # Copy a pair element (a child collection) into the paired output.
            element_identifier = dce.element_identifier
            copied_value = dce.element_object.copy(flush=False)
            paired_dce_copies[element_identifier] = copied_value
            paired_datasets.append(copied_value.elements[0].element_object)
            paired_datasets.append(copied_value.elements[1].element_object)

        if collection_type == "list":
            for element in collection.elements:
                _handle_unpaired(element)
        elif collection_type == "list:paired":
            for element in collection.elements:
                _handle_paired(element)
        elif collection_type == "list:paired_or_unpaired":
            # Mixed case: decide per element whether it is a flat dataset or a pair.
            for element in collection.elements:
                if getattr(element.element_object, "history_content_type", None) == "dataset":
                    _handle_unpaired(element)
                else:
                    _handle_paired(element)

        self._add_datasets_to_history(history, unpaired_dce_copies.values())
        self._add_datasets_to_history(history, paired_datasets)
        output_collections.create_collection(
            self.outputs["output_unpaired"], "output_unpaired", elements=unpaired_dce_copies, propagate_hda_tags=False
        )
        output_collections.create_collection(
            self.outputs["output_paired"], "output_paired", elements=paired_dce_copies, propagate_hda_tags=False
        )


class ExtractDatasetCollectionTool(DatabaseOperationTool):
tool_type = "extract_dataset"
require_terminal_states = False
Expand Down
Loading

0 comments on commit d783f92

Please sign in to comment.