WIP: unpack directory to extra_files_path
TODO:

 - [x] test for local file upload
 - [x] delete source
 - [ ] test for deferred upload
 - [ ] workflow test for directory pass-through and manipulation
 - [ ] record transform?
mvdbeek committed Nov 1, 2023
1 parent a3e6d5d commit a993e7b
Showing 4 changed files with 61 additions and 11 deletions.
6 changes: 6 additions & 0 deletions lib/galaxy/datatypes/data.py
@@ -1211,6 +1211,12 @@ def _archive_main_file(
         error, msg, messagetype = False, "", ""
         return (error, msg, messagetype)
 
+    @staticmethod
+    def to_directory(path: str, extra_files_path: str, remove_source: bool = False):
+        compression_utils.CompressedFile(path).extract(extra_files_path)
+        if remove_source:
+            os.remove(path)
+
 
 class GenericAsn1(Text):
     """Class for generic ASN.1 text format"""
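For orientation, a minimal usage sketch of the new helper. The stand-in function below mirrors the staticmethod added above so the sketch runs without a Galaxy datatype instance; both paths are made up for illustration:

```python
import os

from galaxy.util import compression_utils

# Stand-in mirroring the new helper (the real method lives on the datatype
# class in the diff above).
def to_directory(path: str, extra_files_path: str, remove_source: bool = False):
    compression_utils.CompressedFile(path).extract(extra_files_path)
    if remove_source:
        os.remove(path)

# Unpack a staged archive into a dataset's extra files directory and drop the
# archive afterwards (remove_source=True covers the "delete source" TODO item).
to_directory("/tmp/uploads/testdir.tar", "/data/dataset_1_files", remove_source=True)
```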
18 changes: 17 additions & 1 deletion lib/galaxy/model/deferred.py
@@ -33,6 +33,7 @@
 from galaxy.objectstore import (
     ObjectStore,
     ObjectStorePopulator,
+    persist_extra_files,
 )
 
 log = logging.getLogger(__name__)
@@ -128,16 +129,31 @@ def ensure_materialized(
                 sa_session.commit()
             object_store_populator.set_dataset_object_store_id(materialized_dataset)
             path = self._stream_source(target_source, datatype=dataset_instance.datatype)
+            if dataset_instance.extension == "directory":
+                dataset_instance.datatype.to_directory(path, materialized_dataset.extra_files_path)
+                persist_extra_files(
+                    object_store=object_store,
+                    src_extra_files_path=materialized_dataset.extra_files_path,
+                    primary_data=materialized_dataset,
+                )
+                with open(path, "w") as out:
+                    # just create a zero-length file; we probably need this to find
+                    # the object store
+                    pass
             object_store.update_from_file(materialized_dataset, file_name=path)
 
         else:
             transient_path_mapper = self._transient_path_mapper
             assert transient_path_mapper
             transient_paths = transient_path_mapper.transient_paths_for(dataset)
             # TODO: optimize this by streaming right to this path...
+            # TODO: take into account transform and ensure we do or do not modify the file as appropriate.
             path = self._stream_source(target_source, datatype=dataset_instance.datatype)
-            shutil.move(path, transient_paths.external_filename)
             materialized_dataset.external_filename = transient_paths.external_filename
+            if dataset_instance.ext == "directory":
+                dataset_instance.datatype.to_directory(path, transient_paths.external_extra_files_path)
+            else:
+                shutil.move(path, transient_paths.external_filename)
 
         history = target_history
         if history is None and isinstance(dataset_instance, HistoryDatasetAssociation):
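Read as plain code, the attached-store branch above does three things for a "directory" dataset. Here is a condensed, self-contained paraphrase (a sketch assuming the collaborating objects behave as in `galaxy.objectstore`, not a drop-in implementation):

```python
from galaxy.objectstore import persist_extra_files

def materialize_directory(datatype, path, materialized_dataset, object_store):
    # 1. Unpack the streamed archive into the dataset's extra files directory.
    datatype.to_directory(path, materialized_dataset.extra_files_path)
    # 2. Push the unpacked tree into the object store.
    persist_extra_files(
        object_store=object_store,
        src_extra_files_path=materialized_dataset.extra_files_path,
        primary_data=materialized_dataset,
    )
    # 3. Truncate the primary file to zero bytes: the dataset still needs a
    #    concrete primary path so the object store can locate it.
    open(path, "w").close()
    object_store.update_from_file(materialized_dataset, file_name=path)
```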
8 changes: 7 additions & 1 deletion lib/galaxy/tools/data_fetch.py
@@ -317,11 +317,11 @@ def _resolve_item_with_primary(item):
         elif not link_data_only:
             path = upload_config.ensure_in_working_directory(path, purge_source, in_place)
 
+        extra_files_path = f"{path}_extra"
         extra_files = item.get("extra_files")
         if extra_files:
             # TODO: optimize to just copy the whole directory to extra files instead.
             assert not upload_config.link_data_only, "linking composite dataset files not yet implemented"
-            extra_files_path = f"{path}_extra"
             staged_extra_files = extra_files_path
             os.mkdir(extra_files_path)
 
@@ -361,6 +361,12 @@ def walk_extra_files(items, prefix=""):
             assert path
             datatype.groom_dataset_content(path)
 
+        if hasattr(datatype, "to_directory"):
+            datatype.to_directory(path, extra_files_path)
+            staged_extra_files = extra_files_path
+            if not link_data_only:
+                os.remove(path)
+
         if len(transform) > 0:
             source_dict["transform"] = transform
     elif not error_message:
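The hoisted `extra_files_path` assignment in the first hunk matters because the new `to_directory` call in the second hunk sits outside the `if extra_files:` block. A stripped-down sketch of the resulting control flow (the function name and parameters are stand-ins; the surrounding machinery is elided):

```python
import os

def stage_upload(path, datatype, extra_files, link_data_only):
    extra_files_path = f"{path}_extra"  # computed before either branch
    staged_extra_files = None

    if extra_files:  # composite upload: stage the explicitly listed files
        staged_extra_files = extra_files_path
        os.mkdir(extra_files_path)

    if hasattr(datatype, "to_directory"):  # directory datatype: unpack the archive
        datatype.to_directory(path, extra_files_path)
        staged_extra_files = extra_files_path
        if not link_data_only:
            os.remove(path)  # the source archive itself is no longer needed

    return staged_extra_files
```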
40 changes: 31 additions & 9 deletions lib/galaxy_test/api/test_tools_upload.py
@@ -7,6 +7,7 @@
 
 from galaxy.tool_util.verify.test_data import TestDataResolver
 from galaxy.util import UNKNOWN
+from galaxy.util.compression_utils import decompress_bytes_to_directory
 from galaxy.util.unittest_utils import (
     skip_if_github_down,
     skip_if_site_down,
@@ -25,6 +26,14 @@
 )
 from ._framework import ApiTestCase
 
+EXPECTED_CONTENTS = {
+    "testdir": "Directory",
+    "testdir/c": "Directory",
+    "testdir/a": "File",
+    "testdir/b": "File",
+    "testdir/c/d": "File",
+}
+
 
 class TestToolsUpload(ApiTestCase):
     dataset_populator: DatasetPopulator
@@ -600,18 +609,11 @@ def _check_testdir_composite(self, dataset, history_id):
         assert content.strip() == "Test123"
         extra_files = self.dataset_populator.get_history_dataset_extra_files(history_id, dataset_id=dataset["id"])
         assert len(extra_files) == 5, extra_files
-        expected_contents = {
-            "testdir": "Directory",
-            "testdir/c": "Directory",
-            "testdir/a": "File",
-            "testdir/b": "File",
-            "testdir/c/d": "File",
-        }
         found_files = set()
         for extra_file in extra_files:
             path = extra_file["path"]
-            assert path in expected_contents
-            assert extra_file["class"] == expected_contents[path]
+            assert path in EXPECTED_CONTENTS
+            assert extra_file["class"] == EXPECTED_CONTENTS[path]
             found_files.add(path)
 
         assert len(found_files) == 5, found_files
@@ -657,6 +659,26 @@ def test_upload_bam_file(self):
         assert details["state"] == "ok"
         assert details["file_ext"] == "bam", details
 
+    def test_fetch_directory(self, history_id):
+        testdir = TestDataResolver().get_filename("testdir.tar")
+        with open(testdir, "rb") as fh:
+            details = self._upload_and_get_details(
+                fh, api="fetch", history_id=history_id, ext="directory", assert_ok=True
+            )
+        assert details["file_ext"] == "directory"
+        # assert details["file_size"] == something
+        content = self.dataset_populator.get_history_dataset_content(
+            history_id, dataset=details, to_ext="directory", type="bytes"
+        )
+        dir_path = decompress_bytes_to_directory(content)
+        assert dir_path.endswith("testdir")
+        for path, entry_class in EXPECTED_CONTENTS.items():
+            path = os.path.join(dir_path, os.path.pardir, path)
+            if entry_class == "Directory":
+                assert os.path.isdir(path)
+            else:
+                assert os.path.isfile(path)
+
     def test_fetch_metadata(self):
         table = ONE_TO_SIX_WITH_SPACES
         details = self._upload_and_get_details(
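One non-obvious detail in `test_fetch_directory`: `dir_path` already ends in `testdir`, and every key in `EXPECTED_CONTENTS` also starts with `testdir/`, so the test routes through `os.path.pardir` to avoid doubling that path component. A worked example of the join (the temporary directory name is made up):

```python
import os

dir_path = "/tmp/extract_abc123/testdir"  # illustrative extraction result
rel = "testdir/c/d"                       # a key from EXPECTED_CONTENTS
joined = os.path.join(dir_path, os.path.pardir, rel)
# joined == "/tmp/extract_abc123/testdir/../testdir/c/d"
assert os.path.normpath(joined) == "/tmp/extract_abc123/testdir/c/d"
```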
