From 6be29def66a03fc4d95932e719cb80a351d4ff85 Mon Sep 17 00:00:00 2001
From: mvdbeek
Date: Fri, 7 Jun 2024 09:36:43 +0200
Subject: [PATCH] Don't push purged dataset contents to object store

This in particular needs a lot of new tests. We will also need to
actively purge datasets in the model store import code, since users
might have purged datasets while the job ran. Again, more tests needed.
---
 lib/galaxy/metadata/set_metadata.py |  4 ++--
 lib/galaxy/model/__init__.py        | 15 ++++++++-------
 lib/galaxy/model/store/__init__.py  | 21 +++++++++++----------
 lib/galaxy/model/store/discover.py  |  3 +++
 lib/galaxy/objectstore/__init__.py  |  2 +-
 5 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/lib/galaxy/metadata/set_metadata.py b/lib/galaxy/metadata/set_metadata.py
index 8f1f9ea86be9..d287a9a84c94 100644
--- a/lib/galaxy/metadata/set_metadata.py
+++ b/lib/galaxy/metadata/set_metadata.py
@@ -96,7 +96,7 @@ def push_if_necessary(object_store: ObjectStore, dataset: DatasetInstance, exter
     # or a remote object store from its cache path.
     # empty files could happen when outputs are discovered from working dir,
     # empty file check needed for e.g. test/integration/test_extended_metadata_outputs_to_working_directory.py::test_tools[multi_output_assign_primary]
-    if os.path.getsize(external_filename):
+    if not dataset.dataset.purged and os.path.getsize(external_filename):
         object_store.update_from_file(dataset.dataset, file_name=external_filename, create=True)
 
 
@@ -477,7 +477,7 @@ def set_meta(new_dataset_instance, file_dict):
                 object_store_update_actions.append(partial(reset_external_filename, dataset))
                 object_store_update_actions.append(partial(dataset.set_total_size))
                 object_store_update_actions.append(partial(export_store.add_dataset, dataset))
-                if dataset_instance_id not in unnamed_id_to_path:
+                if dataset_instance_id not in unnamed_id_to_path and not dataset.dataset.purged:
                     object_store_update_actions.append(partial(collect_extra_files, object_store, dataset, "."))
                 dataset_state = "deferred" if (is_deferred and final_job_state == "ok") else final_job_state
                 if not dataset.state == dataset.states.ERROR:
diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py
index 97ff0f6b0fbf..582da171d315 100644
--- a/lib/galaxy/model/__init__.py
+++ b/lib/galaxy/model/__init__.py
@@ -9485,13 +9485,14 @@ def dataset(self) -> Optional[Dataset]:
     def update_from_file(self, file_name):
         if not self.dataset:
             raise Exception("Attempted to write MetadataFile, but no DatasetAssociation set")
-        self.dataset.object_store.update_from_file(
-            self,
-            file_name=file_name,
-            extra_dir="_metadata_files",
-            extra_dir_at_root=True,
-            alt_name=os.path.basename(self.get_file_name()),
-        )
+        if not self.dataset.purged:
+            self.dataset.object_store.update_from_file(
+                self,
+                file_name=file_name,
+                extra_dir="_metadata_files",
+                extra_dir_at_root=True,
+                alt_name=os.path.basename(self.get_file_name()),
+            )
 
     def get_file_name(self, sync_cache=True):
         # Ensure the directory structure and the metadata file object exist
diff --git a/lib/galaxy/model/store/__init__.py b/lib/galaxy/model/store/__init__.py
index 1b78f52ebcc8..42c6e927e2c1 100644
--- a/lib/galaxy/model/store/__init__.py
+++ b/lib/galaxy/model/store/__init__.py
@@ -654,17 +654,18 @@ def handle_dataset_object_edit(dataset_instance, dataset_attrs):
                 dataset_instance.state = dataset_state
                 if not self.object_store:
                     raise Exception(f"self.object_store is missing from {self}.")
-                self.object_store.update_from_file(
-                    dataset_instance.dataset, file_name=temp_dataset_file_name, create=True
-                )
+                if not dataset_instance.dataset.purged:
+                    self.object_store.update_from_file(
+                        dataset_instance.dataset, file_name=temp_dataset_file_name, create=True
+                    )
 
-                # Import additional files if present. Histories exported previously might not have this attribute set.
-                dataset_extra_files_path = dataset_attrs.get("extra_files_path", None)
-                if dataset_extra_files_path:
-                    assert file_source_root
-                    dataset_extra_files_path = os.path.join(file_source_root, dataset_extra_files_path)
-                    persist_extra_files(self.object_store, dataset_extra_files_path, dataset_instance)
-                # Don't trust serialized file size
+                    # Import additional files if present. Histories exported previously might not have this attribute set.
+                    dataset_extra_files_path = dataset_attrs.get("extra_files_path", None)
+                    if dataset_extra_files_path:
+                        assert file_source_root
+                        dataset_extra_files_path = os.path.join(file_source_root, dataset_extra_files_path)
+                        persist_extra_files(self.object_store, dataset_extra_files_path, dataset_instance)
+                    # Don't trust serialized file size
                 dataset_instance.dataset.file_size = None
                 dataset_instance.dataset.set_total_size()  # update the filesize record in the database
 
diff --git a/lib/galaxy/model/store/discover.py b/lib/galaxy/model/store/discover.py
index 9329bda9ee40..2db3749feac9 100644
--- a/lib/galaxy/model/store/discover.py
+++ b/lib/galaxy/model/store/discover.py
@@ -214,6 +214,9 @@ def create_dataset(
         return primary_data
 
     def finalize_storage(self, primary_data, dataset_attributes, extra_files, filename, link_data, output_name):
+        if primary_data.dataset.purged:
+            # metadata won't be set, maybe we should do that, then purge ?
+            return
         # Move data from temp location to dataset location
         if not link_data:
             dataset = primary_data.dataset
diff --git a/lib/galaxy/objectstore/__init__.py b/lib/galaxy/objectstore/__init__.py
index f6bb0f4b07ae..49b559525ea9 100644
--- a/lib/galaxy/objectstore/__init__.py
+++ b/lib/galaxy/objectstore/__init__.py
@@ -1670,7 +1670,7 @@ def persist_extra_files(
     primary_data: "DatasetInstance",
     extra_files_path_name: Optional[str] = None,
 ) -> None:
-    if os.path.exists(src_extra_files_path):
+    if not primary_data.dataset.purged and os.path.exists(src_extra_files_path):
         assert primary_data.dataset
         if not extra_files_path_name:
             extra_files_path_name = primary_data.dataset.extra_files_path_name_from(object_store)
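
Reviewer note (not part of the patch): every hunk applies the same guard, an
`if not <dataset>.purged` check wrapped around an object store write. Below is
a minimal sketch of the kind of unit test the commit message asks for, runnable
with pytest. FakeDataset and FakeObjectStore are hypothetical stand-ins
invented here, not Galaxy classes, and they flatten Galaxy's
`dataset_instance.dataset` indirection; a real test would go through Galaxy's
model and object store fixtures instead.

import os
import tempfile


class FakeDataset:
    """Hypothetical stand-in for a Galaxy Dataset; only the purged flag matters here."""

    def __init__(self, purged=False):
        self.purged = purged


class FakeObjectStore:
    """Hypothetical stand-in that records which datasets get pushed."""

    def __init__(self):
        self.pushed = []

    def update_from_file(self, dataset, file_name=None, create=False):
        self.pushed.append(dataset)


def push_if_necessary(object_store, dataset, external_filename):
    # Same guard as the set_metadata.py hunk above: skip purged datasets
    # and zero-length files.
    if not dataset.purged and os.path.getsize(external_filename):
        object_store.update_from_file(dataset, file_name=external_filename, create=True)


def test_purged_dataset_contents_are_not_pushed():
    object_store = FakeObjectStore()
    with tempfile.NamedTemporaryFile(delete=False) as handle:
        handle.write(b"tool output")
        path = handle.name
    try:
        push_if_necessary(object_store, FakeDataset(purged=True), path)
        assert object_store.pushed == []  # purged: nothing written
        push_if_necessary(object_store, FakeDataset(purged=False), path)
        assert len(object_store.pushed) == 1  # not purged: one write
    finally:
        os.unlink(path)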