Merge pull request galaxyproject#17346 from jmchilton/model_store_fixes
Fix model store exports that include implicit conversions.
mvdbeek authored Feb 8, 2024
2 parents af169f7 + 4157042 commit 5f25655
Showing 4 changed files with 261 additions and 83 deletions.
36 changes: 31 additions & 5 deletions lib/galaxy/model/__init__.py
@@ -4706,12 +4706,14 @@ def get_converted_dataset(self, trans, target_ext, target_context=None, history=
).values()
)
)
return self.attach_implicitly_converted_dataset(trans.sa_session, new_dataset, target_ext)

def attach_implicitly_converted_dataset(self, session, new_dataset, target_ext: str):
new_dataset.name = self.name
self.copy_attributes(new_dataset)
assoc = ImplicitlyConvertedDatasetAssociation(
parent=self, file_type=target_ext, dataset=new_dataset, metadata_safe=False
)
session = trans.sa_session
session.add(new_dataset)
session.add(assoc)
with transaction(session):
@@ -6048,7 +6050,7 @@ def inheritable(self):
return True # always allow inheriting, used for replacement


class ImplicitlyConvertedDatasetAssociation(Base, RepresentById):
class ImplicitlyConvertedDatasetAssociation(Base, Serializable):
__tablename__ = "implicitly_converted_dataset_association"

id = Column(Integer, primary_key=True)
@@ -6086,22 +6088,34 @@ class ImplicitlyConvertedDatasetAssociation(Base, RepresentById):
)

def __init__(
self, id=None, parent=None, dataset=None, file_type=None, deleted=False, purged=False, metadata_safe=True
self,
id=None,
parent=None,
dataset=None,
file_type=None,
deleted=False,
purged=False,
metadata_safe=True,
for_import=False,
):
self.id = id
add_object_to_object_session(self, dataset)
if isinstance(dataset, HistoryDatasetAssociation):
self.dataset = dataset
elif isinstance(dataset, LibraryDatasetDatasetAssociation):
self.dataset_ldda = dataset
else:
elif not for_import:
raise AttributeError(f"Unknown dataset type provided for dataset: {type(dataset)}")
# else if for import - these connections might not have been included in the store,
# so recover what data we can
if isinstance(parent, HistoryDatasetAssociation):
self.parent_hda = parent
elif isinstance(parent, LibraryDatasetDatasetAssociation):
self.parent_ldda = parent
else:
elif not for_import:
raise AttributeError(f"Unknown dataset type provided for parent: {type(parent)}")
# else if for import - these connections might not have been included in the store,
# so recover what data we can
self.type = file_type
self.deleted = deleted
self.purged = purged
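The new for_import flag exists because an imported store may not include both endpoints of a conversion: on import the association is rebuilt first and its endpoints attached afterwards, so the strict type checks above are skipped rather than raising. A minimal sketch of that construction pattern, mirroring the import code further down (assumes a Galaxy environment; not code from this commit):

from galaxy import model

# Rebuild the association shell first; endpoints are wired up later when present.
idc = model.ImplicitlyConvertedDatasetAssociation(metadata_safe=False, for_import=True)
idc.type = "bed"  # file_type recovered from the serialized conversion record
# idc.dataset / idc.parent_hda are attached afterwards from the import tracker.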
@@ -6121,6 +6135,18 @@ def clear(self, purge=False, delete_dataset=True):
except Exception as e:
log.error(f"Failed to purge associated file ({self.get_file_name()}) from disk: {unicodify(e)}")

def _serialize(self, id_encoder, serialization_options):
rval = dict_for(
self,
file_type=self.type,
)
if self.parent_hda:
rval["parent_hda"] = serialization_options.get_identifier(id_encoder, self.parent_hda)
if self.dataset:
rval["hda"] = serialization_options.get_identifier(id_encoder, self.dataset)
serialization_options.attach_identifier(id_encoder, self, rval)
return rval
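For orientation, the record _serialize emits per conversion is intentionally small: the file type plus identifiers for the parent and converted HDAs when they are tracked. A rough sketch of its shape (keys mirror _serialize above; the identifier values are hypothetical placeholders whose exact form depends on the id_encoder and serialization_options in use):

# Hypothetical example of one entry in implicit_dataset_conversions.txt;
# serialization_options.attach_identifier also adds the association's own identifier.
conversion_record = {
    "file_type": "bed",
    "parent_hda": "hda-key-1",  # source HDA, when present
    "hda": "hda-key-2",         # converted HDA, when present
}
print(conversion_record)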


DEFAULT_COLLECTION_NAME = "Unnamed Collection"

101 changes: 94 additions & 7 deletions lib/galaxy/model/store/__init__.py
@@ -136,12 +136,14 @@
ATTRS_FILENAME_LIBRARIES = "libraries_attrs.txt"
ATTRS_FILENAME_LIBRARY_FOLDERS = "library_folders_attrs.txt"
ATTRS_FILENAME_INVOCATIONS = "invocation_attrs.txt"
ATTRS_FILENAME_CONVERSIONS = "implicit_dataset_conversions.txt"
TRACEBACK = "traceback.txt"
GALAXY_EXPORT_VERSION = "2"

DICT_STORE_ATTRS_KEY_HISTORY = "history"
DICT_STORE_ATTRS_KEY_DATASETS = "datasets"
DICT_STORE_ATTRS_KEY_COLLECTIONS = "collections"
DICT_STORE_ATTRS_KEY_CONVERSIONS = "implicit_dataset_conversions"
DICT_STORE_ATTRS_KEY_JOBS = "jobs"
DICT_STORE_ATTRS_KEY_IMPLICIT_COLLECTION_JOBS = "implicit_collection_jobs"
DICT_STORE_ATTRS_KEY_LIBRARIES = "libraries"
@@ -297,6 +299,10 @@ def invocations_properties(self) -> List[Dict[str, Any]]: ...
def collections_properties(self) -> List[Dict[str, Any]]:
"""Return a list of HDCA properties."""

@abc.abstractmethod
def implicit_dataset_conversion_properties(self) -> List[Dict[str, Any]]:
"""Return a list of ImplicitlyConvertedDatasetAssociation properties."""

@abc.abstractmethod
def jobs_properties(self) -> List[Dict[str, Any]]:
"""Return a list of jobs properties."""
@@ -385,6 +391,7 @@ def perform_import(
self._import_collection_instances(object_import_tracker, collections_attrs, history, new_history)
self._import_collection_implicit_input_associations(object_import_tracker, collections_attrs)
self._import_collection_copied_associations(object_import_tracker, collections_attrs)
self._import_implicit_dataset_conversions(object_import_tracker)
self._reassign_hids(object_import_tracker, history)
self._import_jobs(object_import_tracker, history)
self._import_implicit_collection_jobs(object_import_tracker)
@@ -1001,6 +1008,13 @@ def _reassign_hids(self, object_import_tracker: "ObjectImportTracker", history:
history.stage_addition(obj)
history.add_pending_items()

if object_import_tracker.copy_hid_for:
# in an if to avoid flush if unneeded
for from_dataset, to_dataset in object_import_tracker.copy_hid_for.items():
to_dataset.hid = from_dataset.hid
self._session_add(to_dataset)
self._flush()
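The intent of copy_hid_for is plain HID propagation: a converted dataset skips the normal hid assignment and instead mirrors the hid of the dataset it was derived from, so parent and conversion display as the same history item. A toy illustration of the mapping's semantics, using plain objects instead of Galaxy models:

# Toy stand-ins for HDAs; only the hid attribute matters here.
class FakeDataset:
    def __init__(self, hid=None):
        self.hid = hid

parent = FakeDataset(hid=7)   # got its hid via the normal requires_hid path
converted = FakeDataset()     # removed from requires_hid; mirrors its parent instead
copy_hid_for = {parent: converted}

for from_dataset, to_dataset in copy_hid_for.items():
    to_dataset.hid = from_dataset.hid

assert converted.hid == 7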

def _import_workflow_invocations(
self, object_import_tracker: "ObjectImportTracker", history: Optional[model.History]
) -> None:
@@ -1242,6 +1256,29 @@ def _import_jobs(self, object_import_tracker: "ObjectImportTracker", history: Op
if object_key in job_attrs:
object_import_tracker.jobs_by_key[job_attrs[object_key]] = imported_job

def _import_implicit_dataset_conversions(self, object_import_tracker: "ObjectImportTracker") -> None:
implicit_dataset_conversion_attrs = self.implicit_dataset_conversion_properties()
for idc_attrs in implicit_dataset_conversion_attrs:
# I don't know what metadata_safe does per se... should we copy this property or
# just set it to False?
metadata_safe = False
idc = model.ImplicitlyConvertedDatasetAssociation(metadata_safe=metadata_safe, for_import=True)
idc.type = idc_attrs["file_type"]
if idc_attrs.get("parent_hda"):
idc.parent_hda = object_import_tracker.hdas_by_key[idc_attrs["parent_hda"]]
if idc_attrs.get("hda"):
idc.dataset = object_import_tracker.hdas_by_key[idc_attrs["hda"]]

# we have both the dataset and the parent, let's ensure they end up with the same HID
if idc.dataset and idc.parent_hda and idc.parent_hda in object_import_tracker.requires_hid:
try:
object_import_tracker.requires_hid.remove(idc.dataset)
except ValueError:
pass # we wanted to remove it anyway.
object_import_tracker.copy_hid_for[idc.parent_hda] = idc.dataset

self._session_add(idc)

def _import_implicit_collection_jobs(self, object_import_tracker: "ObjectImportTracker") -> None:
object_key = self.object_key

@@ -1303,6 +1340,9 @@ def _copied_from_object_key(
return copied_from_object_key


HasHid = Union[model.HistoryDatasetAssociation, model.HistoryDatasetCollectionAssociation]


class ObjectImportTracker:
"""Keep track of new and existing imported objects.
@@ -1320,7 +1360,8 @@ class ObjectImportTracker:
hda_copied_from_sinks: Dict[ObjectKeyType, ObjectKeyType]
hdca_copied_from_sinks: Dict[ObjectKeyType, ObjectKeyType]
jobs_by_key: Dict[ObjectKeyType, model.Job]
requires_hid: List[Union[model.HistoryDatasetAssociation, model.HistoryDatasetCollectionAssociation]]
requires_hid: List[HasHid]
copy_hid_for: Dict[HasHid, HasHid]

def __init__(self) -> None:
self.libraries_by_key = {}
@@ -1338,6 +1379,7 @@ def __init__(self) -> None:
self.implicit_collection_jobs_by_key: Dict[str, ImplicitCollectionJobs] = {}
self.workflows_by_key: Dict[str, model.Workflow] = {}
self.requires_hid = []
self.copy_hid_for = {}

self.new_history: Optional[model.History] = None

@@ -1421,6 +1463,9 @@ def datasets_properties(
def collections_properties(self) -> List[Dict[str, Any]]:
return self._store_as_dict.get(DICT_STORE_ATTRS_KEY_COLLECTIONS) or []

def implicit_dataset_conversion_properties(self) -> List[Dict[str, Any]]:
return self._store_as_dict.get(DICT_STORE_ATTRS_KEY_CONVERSIONS) or []

def library_properties(
self,
) -> List[Dict[str, Any]]:
@@ -1495,6 +1540,9 @@ def datasets_properties(self) -> List[Dict[str, Any]]:
def collections_properties(self) -> List[Dict[str, Any]]:
return self._read_list_if_exists(ATTRS_FILENAME_COLLECTIONS)

def implicit_dataset_conversion_properties(self) -> List[Dict[str, Any]]:
return self._read_list_if_exists(ATTRS_FILENAME_CONVERSIONS)

def library_properties(
self,
) -> List[Dict[str, Any]]:
@@ -1862,6 +1910,7 @@ def __init__(
)
self.export_files = export_files
self.included_datasets: Dict[model.DatasetInstance, Tuple[model.DatasetInstance, bool]] = {}
self.dataset_implicit_conversions: Dict[model.DatasetInstance, model.ImplicitlyConvertedDatasetAssociation] = {}
self.included_collections: List[Union[model.DatasetCollection, model.HistoryDatasetCollectionAssociation]] = []
self.included_libraries: List[model.Library] = []
self.included_library_folders: List[model.LibraryFolder] = []
@@ -1930,9 +1979,14 @@ def add(src, dest):
if not os.path.exists(dir_path):
os.makedirs(dir_path)

target_filename = get_export_dataset_filename(as_dict["name"], as_dict["extension"], dataset_hid)
conversion = self.dataset_implicit_conversions.get(dataset)
conversion_key = (
self.serialization_options.get_identifier(self.security, conversion) if conversion else None
)
target_filename = get_export_dataset_filename(
as_dict["name"], as_dict["extension"], dataset_hid, conversion_key=conversion_key
)
arcname = os.path.join(dir_name, target_filename)

src = file_name
dest = os.path.join(export_directory, arcname)
add(src, dest)
@@ -1945,7 +1999,10 @@ def add(src, dest):
file_list = []

if len(file_list):
arcname = os.path.join(dir_name, f"extra_files_path_{dataset_hid}")
extra_files_target_filename = get_export_dataset_extra_files_dir_name(
as_dict["name"], as_dict["extension"], dataset_hid, conversion_key=conversion_key
)
arcname = os.path.join(dir_name, extra_files_target_filename)
add(extra_files_path, os.path.join(export_directory, arcname))
as_dict["extra_files_path"] = arcname
else:
@@ -2140,7 +2197,13 @@ def export_history(
if dataset not in self.included_datasets:
if should_include_file:
self._ensure_dataset_file_exists(dataset)
self.add_dataset(dataset, include_files=should_include_file)
if dataset.implicitly_converted_parent_datasets:
# fetching 0th of list but I think this is just a mapping quirk - I can't imagine how there
# would be more than one of these -John
conversion = dataset.implicitly_converted_parent_datasets[0]
self.add_implicit_conversion_dataset(dataset, should_include_file, conversion)
else:
self.add_dataset(dataset, include_files=should_include_file)
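On the export side the decision above is pure bookkeeping: a converted dataset is still exported like any other, but its conversion is remembered alongside it so the serializer can write the conversion record and disambiguate the exported filename later. A toy sketch of the two tracking dicts keyed by the same (hypothetical) dataset object:

# Toy sketch only; the real code stores model.DatasetInstance and
# model.ImplicitlyConvertedDatasetAssociation objects in these dicts.
included_datasets = {}
dataset_implicit_conversions = {}

dataset = object()      # stands in for the converted HDA
conversion = object()   # stands in for its ImplicitlyConvertedDatasetAssociation

included_datasets[dataset] = (dataset, True)  # (dataset, include_files)
dataset_implicit_conversions[dataset] = conversion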

def export_library(
self, library: model.Library, include_hidden: bool = False, include_deleted: bool = False
@@ -2219,6 +2282,15 @@ def add_dataset_collection(
self.collections_attrs.append(collection)
self.included_collections.append(collection)

def add_implicit_conversion_dataset(
self,
dataset: model.DatasetInstance,
include_files: bool,
conversion: model.ImplicitlyConvertedDatasetAssociation,
) -> None:
self.included_datasets[dataset] = (dataset, include_files)
self.dataset_implicit_conversions[dataset] = conversion

def add_dataset(self, dataset: model.DatasetInstance, include_files: bool = True) -> None:
self.included_datasets[dataset] = (dataset, include_files)

@@ -2265,6 +2337,10 @@ def to_json(attributes):
with open(collections_attrs_filename, "w") as collections_attrs_out:
collections_attrs_out.write(to_json(self.collections_attrs))

conversions_attrs_filename = os.path.join(export_directory, ATTRS_FILENAME_CONVERSIONS)
with open(conversions_attrs_filename, "w") as conversions_attrs_out:
conversions_attrs_out.write(to_json(self.dataset_implicit_conversions.values()))

jobs_attrs = []
for job_id, job_output_dataset_associations in self.job_output_dataset_associations.items():
output_dataset_mapping: Dict[str, List[Union[str, int]]] = {}
@@ -2387,6 +2463,7 @@ class WriteCrates:
included_invocations: List[model.WorkflowInvocation]
export_directory: StrPath
included_datasets: Dict[model.DatasetInstance, Tuple[model.DatasetInstance, bool]]
dataset_implicit_conversions: Dict[model.DatasetInstance, model.ImplicitlyConvertedDatasetAssociation]
dataset_id_to_path: Dict[int, Tuple[Optional[str], Optional[str]]]

@property
@@ -2890,12 +2967,22 @@ def tar_export_directory(export_directory: StrPath, out_file: StrPath, gzip: boo
store_archive.add(os.path.join(export_directory, export_path), arcname=export_path)


def get_export_dataset_filename(name: str, ext: str, hid: int) -> str:
def get_export_dataset_filename(name: str, ext: str, hid: int, conversion_key: Optional[str]) -> str:
"""
Builds a filename for a dataset using its name and extension.
"""
base = "".join(c in FILENAME_VALID_CHARS and c or "_" for c in name)
return f"{base}_{hid}.{ext}"
if not conversion_key:
return f"{base}_{hid}.{ext}"
else:
return f"{base}_{hid}_conversion_{conversion_key}.{ext}"


def get_export_dataset_extra_files_dir_name(name: str, ext: str, hid: int, conversion_key: Optional[str]) -> str:
if not conversion_key:
return f"extra_files_path_{hid}"
else:
return f"extra_files_path_{hid}_conversion_{conversion_key}"


def imported_store_for_metadata(
1 change: 1 addition & 0 deletions test/unit/data/model/2.txt
@@ -0,0 +1 @@
converted bed
