From 35bb7e2fc391d64a68a3341b7dfd081c73fe62e0 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Fri, 15 Dec 2017 13:55:35 -0500 Subject: [PATCH] Hierarchical upload API optimized for folders & collections. Allows describing hierarchical data in JSON or inferring structure from archives or directories. Datasets or archive sources can be specified via uploads, URLs, paths (if admin && allow_path_paste), library_import_dir/user_library_import_dir, and/or FTP imports. Unlike existing API endpoints, a mix of these on a per file basis is allowed and they work seemlessly between libraries and histories. Supported "archives" include gzip, zip, bagit directories, bagit achives (with fetching and validations of downloads). The existing upload API endpoint is quite rough to work with both in terms of adding parameters (e.g. the file type and dbkey hanlding in 4563 was difficult to implement, terribly hacky, and should seemingly have been trivial) and in terms of building requests (one needs to build a tool form - not describe sensible inputs in JSON). This API is built to be intelligable from an API standpoint instead of being constrained to the older style tool form. Additionally it built with hierarchical data in mind in a way that would not be easy at all enhancing the tool form components we don't even render. This implements 5159 though much simpler YAML descriptions of data libraries should be possible basically as the API descriptions. We can replace the data library script in Ephemeris https://github.com/galaxyproject/ephemeris/blob/master/ephemeris/setup_data_libraries.py with one that converts a simple YAML file into an API call and allows many new options for free. In future PRs I'll add filtering options to this and it will serve as the backend to 4733. --- lib/galaxy/actions/library.py | 5 +- lib/galaxy/app.py | 4 + lib/galaxy/datatypes/sniff.py | 7 + lib/galaxy/datatypes/upload_util.py | 47 +++ .../dependencies/pinned-requirements.txt | 1 + lib/galaxy/jobs/__init__.py | 2 +- lib/galaxy/managers/collections.py | 19 +- lib/galaxy/model/__init__.py | 6 + lib/galaxy/tools/__init__.py | 12 +- lib/galaxy/tools/actions/upload.py | 58 ++++ lib/galaxy/tools/actions/upload_common.py | 12 +- lib/galaxy/tools/data_fetch.py | 313 ++++++++++++++++++ lib/galaxy/tools/data_fetch.xml | 33 ++ lib/galaxy/tools/execute.py | 2 +- lib/galaxy/tools/parameters/output_collect.py | 210 +++++++++++- lib/galaxy/tools/special_tools.py | 1 + lib/galaxy/webapps/galaxy/api/_fetch_util.py | 205 ++++++++++++ lib/galaxy/webapps/galaxy/api/tools.py | 45 +++ lib/galaxy/webapps/galaxy/buildapp.py | 1 + scripts/api/fetch_to_library.py | 33 ++ scripts/api/fetch_to_library_example.yml | 42 +++ test-data/example-bag.zip | Bin 0 -> 2966 bytes test-data/testdir1.zip | Bin 0 -> 825 bytes test/api/test_dataset_collections.py | 72 ++++ test/api/test_libraries.py | 90 +++++ test/base/integration_util.py | 2 + test/base/populators.py | 49 ++- .../test_upload_configuration_options.py | 237 ++++++++++++- tools/data_source/upload.py | 42 +-- 29 files changed, 1478 insertions(+), 72 deletions(-) create mode 100644 lib/galaxy/datatypes/upload_util.py create mode 100644 lib/galaxy/tools/data_fetch.py create mode 100644 lib/galaxy/tools/data_fetch.xml create mode 100644 lib/galaxy/webapps/galaxy/api/_fetch_util.py create mode 100644 scripts/api/fetch_to_library.py create mode 100644 scripts/api/fetch_to_library_example.yml create mode 100644 test-data/example-bag.zip create mode 100644 test-data/testdir1.zip diff --git a/lib/galaxy/actions/library.py b/lib/galaxy/actions/library.py index 06b44cf3355f..dfd6256705a6 100644 --- a/lib/galaxy/actions/library.py +++ b/lib/galaxy/actions/library.py @@ -257,10 +257,7 @@ def _make_library_uploaded_dataset(self, trans, params, name, path, type, librar uploaded_dataset.link_data_only = link_data_only uploaded_dataset.uuid = uuid_str if link_data_only == 'link_to_files': - uploaded_dataset.data.file_name = os.path.abspath(path) - # Since we are not copying the file into Galaxy's managed - # default file location, the dataset should never be purgable. - uploaded_dataset.data.dataset.purgable = False + uploaded_dataset.data.link_to(path) trans.sa_session.add_all((uploaded_dataset.data, uploaded_dataset.data.dataset)) trans.sa_session.flush() return uploaded_dataset diff --git a/lib/galaxy/app.py b/lib/galaxy/app.py index 25de26171058..ce22ccd7f396 100644 --- a/lib/galaxy/app.py +++ b/lib/galaxy/app.py @@ -12,6 +12,8 @@ from galaxy import config, jobs from galaxy.jobs import metrics as job_metrics from galaxy.managers.collections import DatasetCollectionManager +from galaxy.managers.folders import FolderManager +from galaxy.managers.libraries import LibraryManager from galaxy.managers.tags import GalaxyTagManager from galaxy.openid.providers import OpenIDProviders from galaxy.queue_worker import GalaxyQueueWorker @@ -90,6 +92,8 @@ def __init__(self, **kwargs): self.tag_handler = GalaxyTagManager(self.model.context) # Dataset Collection Plugins self.dataset_collections_service = DatasetCollectionManager(self) + self.library_folder_manager = FolderManager() + self.library_manager = LibraryManager() # Tool Data Tables self._configure_tool_data_tables(from_shed_config=False) diff --git a/lib/galaxy/datatypes/sniff.py b/lib/galaxy/datatypes/sniff.py index cf058cf069ac..4f019234e142 100644 --- a/lib/galaxy/datatypes/sniff.py +++ b/lib/galaxy/datatypes/sniff.py @@ -14,6 +14,7 @@ import zipfile from six import text_type +from six.moves.urllib.request import urlopen from galaxy import util from galaxy.util import compression_utils @@ -39,6 +40,12 @@ def get_test_fname(fname): return full_path +def stream_url_to_file(path): + page = urlopen(path) # page will be .close()ed in stream_to_file + temp_name = stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) + return temp_name + + def stream_to_open_named_file(stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict'): """Writes a stream to the provided file descriptor, returns the file name. Closes file descriptor""" # signature and behavor is somewhat odd, due to backwards compatibility, but this can/should be done better diff --git a/lib/galaxy/datatypes/upload_util.py b/lib/galaxy/datatypes/upload_util.py new file mode 100644 index 000000000000..97bb11862ca3 --- /dev/null +++ b/lib/galaxy/datatypes/upload_util.py @@ -0,0 +1,47 @@ +from galaxy.datatypes import sniff +from galaxy.datatypes.binary import Binary + + +class UploadProblemException(Exception): + + def __init__(self, message): + self.message = message + + +def handle_unsniffable_binary_check(data_type, ext, path, name, is_binary, requested_ext, check_content, registry): + """Return modified values of data_type and ext if unsniffable binary encountered. + + Throw UploadProblemException if content problems or extension mismatches occur. + + Precondition: check_binary called returned True. + """ + if is_binary or registry.is_extension_unsniffable_binary(requested_ext): + # We have a binary dataset, but it is not Bam, Sff or Pdf + data_type = 'binary' + parts = name.split(".") + if len(parts) > 1: + ext = parts[-1].strip().lower() + is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext) + if check_content and not is_ext_unsniffable_binary: + raise UploadProblemException('The uploaded binary file contains inappropriate content') + + elif is_ext_unsniffable_binary and requested_ext != ext: + err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext) + raise UploadProblemException(err_msg) + return data_type, ext + + +def handle_sniffable_binary_check(data_type, ext, path, registry): + """Return modified values of data_type and ext if sniffable binary encountered. + + Precondition: check_binary called returned True. + """ + # Sniff the data type + guessed_ext = sniff.guess_ext(path, registry.sniff_order) + # Set data_type only if guessed_ext is a binary datatype + datatype = registry.get_datatype_by_extension(guessed_ext) + if isinstance(datatype, Binary): + data_type = guessed_ext + ext = guessed_ext + + return data_type, ext diff --git a/lib/galaxy/dependencies/pinned-requirements.txt b/lib/galaxy/dependencies/pinned-requirements.txt index adddb8247b9f..f5cb58868e9f 100644 --- a/lib/galaxy/dependencies/pinned-requirements.txt +++ b/lib/galaxy/dependencies/pinned-requirements.txt @@ -18,6 +18,7 @@ pysam>=0.13 #python_lzo==1.8 # pure Python packages +bdbag==1.1.1 bz2file==0.98; python_version < '3.3' ipaddress==1.0.18; python_version < '3.3' boltons==17.1.0 diff --git a/lib/galaxy/jobs/__init__.py b/lib/galaxy/jobs/__init__.py index de1c32538bcb..12e638063ac6 100644 --- a/lib/galaxy/jobs/__init__.py +++ b/lib/galaxy/jobs/__init__.py @@ -1371,7 +1371,7 @@ def path_rewriter(path): collected_datasets = { 'primary': self.tool.collect_primary_datasets(out_data, self.get_tool_provided_job_metadata(), tool_working_directory, input_ext, input_dbkey) } - self.tool.collect_dynamic_collections( + self.tool.collect_dynamic_outputs( out_collections, self.get_tool_provided_job_metadata(), job_working_directory=tool_working_directory, diff --git a/lib/galaxy/managers/collections.py b/lib/galaxy/managers/collections.py index e823b0f44ca3..cff0d0cdc241 100644 --- a/lib/galaxy/managers/collections.py +++ b/lib/galaxy/managers/collections.py @@ -46,17 +46,22 @@ def __init__(self, app): self.tag_manager = tags.GalaxyTagManager(app.model.context) self.ldda_manager = lddas.LDDAManager(app) - def precreate_dataset_collection_instance(self, trans, parent, name, implicit_inputs, implicit_output_name, structure): + def precreate_dataset_collection_instance(self, trans, parent, name, structure, implicit_inputs=None, implicit_output_name=None): # TODO: prebuild all required HIDs and send them in so no need to flush in between. - dataset_collection = self.precreate_dataset_collection(structure) + dataset_collection = self.precreate_dataset_collection(structure, allow_unitialized_element=implicit_output_name is not None) instance = self._create_instance_for_collection( trans, parent, name, dataset_collection, implicit_inputs=implicit_inputs, implicit_output_name=implicit_output_name, flush=False ) return instance - def precreate_dataset_collection(self, structure): - if structure.is_leaf or not structure.children_known: - return model.DatasetCollectionElement.UNINITIALIZED_ELEMENT + def precreate_dataset_collection(self, structure, allow_unitialized_element=True): + has_structure = not structure.is_leaf and structure.children_known + if not has_structure and allow_unitialized_element: + dataset_collection = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT + elif not has_structure: + collection_type_description = structure.collection_type_description + dataset_collection = model.DatasetCollection(populated=False) + dataset_collection.collection_type = collection_type_description.collection_type else: collection_type_description = structure.collection_type_description dataset_collection = model.DatasetCollection(populated=False) @@ -67,7 +72,7 @@ def precreate_dataset_collection(self, structure): if substructure.is_leaf: element = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT else: - element = self.precreate_dataset_collection(substructure) + element = self.precreate_dataset_collection(substructure, allow_unitialized_element=allow_unitialized_element) element = model.DatasetCollectionElement( element=element, @@ -78,7 +83,7 @@ def precreate_dataset_collection(self, structure): dataset_collection.elements = elements dataset_collection.element_count = len(elements) - return dataset_collection + return dataset_collection def create(self, trans, parent, name, collection_type, element_identifiers=None, elements=None, implicit_collection_info=None, trusted_identifiers=None, diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index 984dfb883b16..49121faf8463 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -2032,6 +2032,12 @@ def set_file_name(self, filename): return self.dataset.set_file_name(filename) file_name = property(get_file_name, set_file_name) + def link_to(self, path): + self.file_name = os.path.abspath(path) + # Since we are not copying the file into Galaxy's managed + # default file location, the dataset should never be purgable. + self.dataset.purgable = False + @property def extra_files_path(self): return self.dataset.extra_files_path diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index d0d1fcc1d819..a6e478498fcd 100755 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -102,6 +102,7 @@ # Tools that require Galaxy's Python environment to be preserved. GALAXY_LIB_TOOLS_UNVERSIONED = [ "upload1", + "__DATA_FETCH__", # Legacy tools bundled with Galaxy. "vcf_to_maf_customtrack1", "laj_1", @@ -1041,7 +1042,10 @@ def parse_input_elem(self, page_source, enctypes, context=None): group.file_type_name = elem.get('file_type_name', group.file_type_name) group.default_file_type = elem.get('default_file_type', group.default_file_type) group.metadata_ref = elem.get('metadata_ref', group.metadata_ref) - rval[group.file_type_name].refresh_on_change = True + try: + rval[group.file_type_name].refresh_on_change = True + except KeyError: + pass group_page_source = XmlPageSource(elem) group.inputs = self.parse_input_elem(group_page_source, enctypes, context) rval[group.name] = group @@ -1592,10 +1596,10 @@ def collect_primary_datasets(self, output, tool_provided_metadata, job_working_d """ return output_collect.collect_primary_datasets(self, output, tool_provided_metadata, job_working_directory, input_ext, input_dbkey=input_dbkey) - def collect_dynamic_collections(self, output, tool_provided_metadata, **kwds): - """ Find files corresponding to dynamically structured collections. + def collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds): + """Collect dynamic outputs associated with a job from this tool. """ - return output_collect.collect_dynamic_collections(self, output, tool_provided_metadata, **kwds) + return output_collect.collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds) def to_archive(self): tool = self diff --git a/lib/galaxy/tools/actions/upload.py b/lib/galaxy/tools/actions/upload.py index 70bf8152c44a..38f4f210ddeb 100644 --- a/lib/galaxy/tools/actions/upload.py +++ b/lib/galaxy/tools/actions/upload.py @@ -1,5 +1,7 @@ +import json import logging +from galaxy.exceptions import RequestParameterMissingException from galaxy.tools.actions import upload_common from galaxy.util import ExecutionTimer from . import ToolAction @@ -36,3 +38,59 @@ def execute(self, tool, trans, incoming={}, set_output_hid=True, history=None, * rval = upload_common.create_job(trans, incoming, tool, json_file_path, data_list, history=history) log.debug("Created upload job %s" % create_job_timer) return rval + + +class FetchUploadToolAction(ToolAction): + + def execute(self, tool, trans, incoming={}, set_output_hid=True, history=None, **kwargs): + dataset_upload_inputs = [] + for input_name, input in tool.inputs.items(): + if input.type == "upload_dataset": + dataset_upload_inputs.append(input) + assert dataset_upload_inputs, Exception("No dataset upload groups were found.") + + persisting_uploads_timer = ExecutionTimer() + # precreated_datasets = upload_common.get_precreated_datasets(trans, incoming, trans.app.model.HistoryDatasetAssociation) + incoming = upload_common.persist_uploads(incoming, trans) + log.debug("Persisted uploads %s" % persisting_uploads_timer) + + # Now replace references in requests with these. + files = incoming.get("files", []) + files_iter = iter(files) + request = json.loads(incoming.get("request_json")) + + def replace_file_srcs(request_part): + if isinstance(request_part, dict): + if request_part.get("src", None) == "files": + path_def = next(files_iter) + if path_def is None or path_def["file_data"] is None: + raise RequestParameterMissingException("Failed to find uploaded file matching target with src='files'") + request_part["path"] = path_def["file_data"]["local_filename"] + if "name" not in request_part: + request_part["name"] = path_def["file_data"]["filename"] + request_part["src"] = "path" + else: + for key, value in request_part.items(): + replace_file_srcs(value) + elif isinstance(request_part, list): + for value in request_part: + replace_file_srcs(value) + + replace_file_srcs(request) + + incoming["request_json"] = json.dumps(request) + log.info("incoming are %s" % incoming) + # We can pass an empty string as the cntrller here since it is used to check whether we + # are in an admin view, and this tool is currently not used there. + check_and_cleanup_timer = ExecutionTimer() + # uploaded_datasets = upload_common.get_uploaded_datasets(trans, '', incoming, precreated_datasets, dataset_upload_inputs, history=history) + # upload_common.cleanup_unused_precreated_datasets(precreated_datasets) + + # if not uploaded_datasets: + # return None, 'No data was entered in the upload form, please go back and choose data to upload.' + + log.debug("Checked and cleaned uploads %s" % check_and_cleanup_timer) + create_job_timer = ExecutionTimer() + rval = upload_common.create_job(trans, incoming, tool, None, [], history=history) + log.debug("Created upload job %s" % create_job_timer) + return rval diff --git a/lib/galaxy/tools/actions/upload_common.py b/lib/galaxy/tools/actions/upload_common.py index ae70cae17de3..5857aec9503f 100644 --- a/lib/galaxy/tools/actions/upload_common.py +++ b/lib/galaxy/tools/actions/upload_common.py @@ -16,7 +16,7 @@ from urllib.parse import urlparse from galaxy import datatypes, util -from galaxy.exceptions import ObjectInvalid +from galaxy.exceptions import ConfigDoesNotAllowException, ObjectInvalid from galaxy.managers import tags from galaxy.util import unicodify from galaxy.util.odict import odict @@ -102,7 +102,7 @@ def validate_url(url, ip_whitelist): pass else: # Otherwise, we deny access. - raise Exception("Access to this address in not permitted by server configuration") + raise ConfigDoesNotAllowException("Access to this address in not permitted by server configuration") return url @@ -123,7 +123,7 @@ def persist_uploads(params, trans): local_filename=local_filename) elif type(f) == dict and 'local_filename' not in f: raise Exception('Uploaded file was encoded in a way not understood by Galaxy.') - if upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '': + if 'url_paste' in upload_dataset and upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '': upload_dataset['url_paste'] = datatypes.sniff.stream_to_file( StringIO(validate_url(upload_dataset['url_paste'], trans.app.config.fetch_url_whitelist_ips)), prefix="strio_url_paste_" @@ -334,7 +334,11 @@ def new_upload(trans, cntrller, uploaded_dataset, library_bunch=None, history=No def get_uploaded_datasets(trans, cntrller, params, precreated_datasets, dataset_upload_inputs, library_bunch=None, history=None): uploaded_datasets = [] for dataset_upload_input in dataset_upload_inputs: - uploaded_datasets.extend(dataset_upload_input.get_uploaded_datasets(trans, params)) + try: + uploaded_datasets.extend(dataset_upload_input.get_uploaded_datasets(trans, params)) + except AttributeError: + # TODO: refine... + pass for uploaded_dataset in uploaded_datasets: data = get_precreated_dataset(precreated_datasets, uploaded_dataset.name) if not data: diff --git a/lib/galaxy/tools/data_fetch.py b/lib/galaxy/tools/data_fetch.py new file mode 100644 index 000000000000..0cb87f52485d --- /dev/null +++ b/lib/galaxy/tools/data_fetch.py @@ -0,0 +1,313 @@ +import argparse +import errno +import json +import os +import shutil +import sys +import tempfile + +import bdbag.bdbag_api + +from galaxy.datatypes import sniff +from galaxy.datatypes.registry import Registry +from galaxy.datatypes.upload_util import ( + handle_sniffable_binary_check, + handle_unsniffable_binary_check, + UploadProblemException, +) +from galaxy.util import in_directory +from galaxy.util.checkers import ( + check_binary, + check_html, +) +from galaxy.util.compression_utils import CompressedFile + +DESCRIPTION = """Data Import Script""" + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + args = _arg_parser().parse_args(argv) + + registry = Registry() + registry.load_datatypes(root_dir=args.galaxy_root, config=args.datatypes_registry) + + request_path = args.request + assert os.path.exists(request_path) + with open(request_path) as f: + request = json.load(f) + + upload_config = UploadConfig(request, registry) + galaxy_json = _request_to_galaxy_json(upload_config, request) + with open("galaxy.json", "w") as f: + json.dump(galaxy_json, f) + + +def _request_to_galaxy_json(upload_config, request): + targets = request.get("targets", []) + fetched_targets = [] + + for target in targets: + fetched_target = _fetch_target(upload_config, target) + fetched_targets.append(fetched_target) + + return {"__unnamed_outputs": fetched_targets} + + +def _fetch_target(upload_config, target): + destination = target.get("destination", None) + assert destination, "No destination defined." + + def expand_elements_from(target_or_item): + elements_from = target_or_item.get("elements_from", None) + items = None + assert not elements_from or elements_from in ["archive", "bagit", "bagit_archive", "directory"], elements_from + if elements_from == "archive": + decompressed_directory = _decompress_target(target_or_item) + items = _directory_to_items(decompressed_directory) + elif elements_from == "bagit": + _, elements_from_path = _has_src_to_path(target_or_item) + items = _bagit_to_items(elements_from_path) + elif elements_from == "bagit_archive": + decompressed_directory = _decompress_target(target_or_item) + items = _bagit_to_items(decompressed_directory) + elif elements_from == "directory": + _, elements_from_path = _has_src_to_path(target_or_item) + items = _directory_to_items(elements_from_path) + + if items: + del target_or_item["elements_from"] + target_or_item["elements"] = items + + _for_each_src(expand_elements_from, target) + items = target.get("elements", None) + assert items is not None, "No element definition found for destination [%s]" % destination + + fetched_target = {} + fetched_target["destination"] = destination + if "collection_type" in target: + fetched_target["collection_type"] = target["collection_type"] + if "name" in target: + fetched_target["name"] = target["name"] + + def _resolve_src(item): + converted_path = None + + name, path = _has_src_to_path(item) + dbkey = item.get("dbkey", "?") + requested_ext = item.get("ext", "auto") + info = item.get("info", None) + link_data_only = upload_config.link_data_only + if "link_data_only" in item: + # Allow overriding this on a per file basis. + link_data_only = _link_data_only(item) + to_posix_lines = upload_config.get_option(item, "to_posix_lines") + space_to_tab = upload_config.get_option(item, "space_to_tab") + in_place = item.get("in_place", False) + purge_source = item.get("purge_source", True) + + # Follow upload.py logic but without the auto-decompress logic. + registry = upload_config.registry + check_content = upload_config.check_content + data_type, ext = None, requested_ext + + is_binary = check_binary(path) + if is_binary: + data_type, ext = handle_sniffable_binary_check(data_type, ext, path, registry) + if data_type is None: + if is_binary: + data_type, ext = handle_unsniffable_binary_check( + data_type, ext, path, name, is_binary, requested_ext, check_content, registry + ) + if not data_type and check_content and check_html(path): + raise UploadProblemException('The uploaded file contains inappropriate HTML content') + + if data_type != 'binary': + if not link_data_only: + if to_posix_lines: + if space_to_tab: + line_count, converted_path = sniff.convert_newlines_sep2tabs(path, in_place=in_place, tmp_dir=".") + else: + line_count, converted_path = sniff.convert_newlines(path, in_place=in_place, tmp_dir=".") + + if requested_ext == 'auto': + ext = sniff.guess_ext(path, registry.sniff_order) + else: + ext = requested_ext + + data_type = ext + + if ext == 'auto' and data_type == 'binary': + ext = 'data' + if ext == 'auto' and requested_ext: + ext = requested_ext + if ext == 'auto': + ext = 'data' + + datatype = registry.get_datatype_by_extension(ext) + if link_data_only: + # Never alter a file that will not be copied to Galaxy's local file store. + if datatype.dataset_content_needs_grooming(path): + err_msg = 'The uploaded files need grooming, so change your Copy data into Galaxy? selection to be ' + \ + 'Copy files into Galaxy instead of Link to files without copying into Galaxy so grooming can be performed.' + raise UploadProblemException(err_msg) + + # If this file is not in the workdir make sure it gets there. + if not link_data_only and converted_path: + path = upload_config.ensure_in_working_directory(converted_path, purge_source, in_place) + elif not link_data_only: + path = upload_config.ensure_in_working_directory(path, purge_source, in_place) + + if not link_data_only and datatype and datatype.dataset_content_needs_grooming(path): + # Groom the dataset content if necessary + datatype.groom_dataset_content(path) + + rval = {"name": name, "filename": path, "dbkey": dbkey, "ext": ext, "link_data_only": link_data_only} + if info is not None: + rval["info"] = info + return rval + + elements = elements_tree_map(_resolve_src, items) + + fetched_target["elements"] = elements + return fetched_target + + +def _bagit_to_items(directory): + bdbag.bdbag_api.resolve_fetch(directory) + bdbag.bdbag_api.validate_bag(directory) + items = _directory_to_items(os.path.join(directory, "data")) + return items + + +def _decompress_target(target): + elements_from_name, elements_from_path = _has_src_to_path(target) + temp_directory = tempfile.mkdtemp(prefix=elements_from_name, dir=".") + decompressed_directory = CompressedFile(elements_from_path).extract(temp_directory) + return decompressed_directory + + +def elements_tree_map(f, items): + new_items = [] + for item in items: + if "elements" in item: + new_item = item.copy() + new_item["elements"] = elements_tree_map(f, item["elements"]) + new_items.append(new_item) + else: + new_items.append(f(item)) + return new_items + + +def _directory_to_items(directory): + items = [] + dir_elements = {} + for root, dirs, files in os.walk(directory): + if root in dir_elements: + target = dir_elements[root] + else: + target = items + for dir in dirs: + dir_dict = {"name": dir, "elements": []} + dir_elements[os.path.join(root, dir)] = dir_dict["elements"] + target.append(dir_dict) + for file in files: + target.append({"src": "path", "path": os.path.join(root, file)}) + + return items + + +def _has_src_to_path(item): + assert "src" in item, item + src = item.get("src") + name = item.get("name") + if src == "url": + url = item.get("url") + path = sniff.stream_url_to_file(url) + if name is None: + name = url.split("/")[-1] + else: + assert src == "path" + path = item["path"] + if name is None: + name = os.path.basename(path) + return name, path + + +def _arg_parser(): + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.add_argument("--galaxy-root") + parser.add_argument("--datatypes-registry") + parser.add_argument("--request-version") + parser.add_argument("--request") + return parser + + +class UploadConfig(object): + + def __init__(self, request, registry): + self.registry = registry + self.check_content = request.get("check_content" , True) + self.to_posix_lines = request.get("to_posix_lines", False) + self.space_to_tab = request.get("space_to_tab", False) + self.link_data_only = _link_data_only(request) + + self.__workdir = os.path.abspath(".") + self.__upload_count = 0 + + def get_option(self, item, key): + """Return item[key] if specified otherwise use default from UploadConfig. + + This default represents the default for the whole request instead item which + is the option for individual files. + """ + if key in item: + return item[key] + else: + return getattr(self, key) + + def __new_dataset_path(self): + path = "gxupload_%d" % self.__upload_count + self.__upload_count += 1 + return path + + def ensure_in_working_directory(self, path, purge_source, in_place): + if in_directory(path, self.__workdir): + return path + + new_path = self.__new_dataset_path() + if purge_source: + try: + shutil.move(path, new_path) + except OSError as e: + # We may not have permission to remove converted_path + if e.errno != errno.EACCES: + raise + else: + shutil.copy(path, new_path) + + return new_path + + +def _link_data_only(has_config_dict): + link_data_only = has_config_dict.get("link_data_only", False) + if not isinstance(link_data_only, bool): + # Allow the older string values of 'copy_files' and 'link_to_files' + link_data_only = link_data_only == "copy_files" + return link_data_only + + +def _for_each_src(f, obj): + if isinstance(obj, list): + for item in obj: + _for_each_src(f, item) + if isinstance(obj, dict): + if "src" in obj: + f(obj) + for key, value in obj.items(): + _for_each_src(f, value) + + +if __name__ == "__main__": + main() diff --git a/lib/galaxy/tools/data_fetch.xml b/lib/galaxy/tools/data_fetch.xml new file mode 100644 index 000000000000..5160cb2989c8 --- /dev/null +++ b/lib/galaxy/tools/data_fetch.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + $request_json + + + + + diff --git a/lib/galaxy/tools/execute.py b/lib/galaxy/tools/execute.py index 880bc4a38c30..a1dc7e950f4d 100644 --- a/lib/galaxy/tools/execute.py +++ b/lib/galaxy/tools/execute.py @@ -277,9 +277,9 @@ def precreate_output_collections(self, history, params): trans=trans, parent=history, name=output_collection_name, + structure=effective_structure, implicit_inputs=implicit_inputs, implicit_output_name=output_name, - structure=effective_structure, ) collection_instance.implicit_collection_jobs = implicit_collection_jobs collection_instances[output_name] = collection_instance diff --git a/lib/galaxy/tools/parameters/output_collect.py b/lib/galaxy/tools/parameters/output_collect.py index a452d5805125..88b950970bbf 100644 --- a/lib/galaxy/tools/parameters/output_collect.py +++ b/lib/galaxy/tools/parameters/output_collect.py @@ -9,9 +9,11 @@ from collections import namedtuple from galaxy import util +from galaxy.dataset_collections.structure import UnitializedTree from galaxy.tools.parser.output_collection_def import ( DEFAULT_DATASET_COLLECTOR_DESCRIPTION, INPUT_DBKEY_TOKEN, + ToolProvidedMetadataDatasetCollection, ) from galaxy.util import ( ExecutionTimer, @@ -34,6 +36,9 @@ def get_new_dataset_meta_by_basename(self, output_name, basename): def has_failed_outputs(self): return False + def get_unnamed_outputs(self): + return [] + class LegacyToolProvidedMetadata(object): @@ -84,6 +89,9 @@ def has_failed_outputs(self): return found_failed + def get_unnamed_outputs(self): + return [] + class ToolProvidedMetadata(object): @@ -124,14 +132,21 @@ def _elements_to_datasets(self, elements, level=0): def has_failed_outputs(self): found_failed = False - for meta in self.tool_provided_job_metadata.values(): + for output_name, meta in self.tool_provided_job_metadata.items(): + if output_name == "__unnamed_outputs": + continue + if meta.get("failed", False): found_failed = True return found_failed + def get_unnamed_outputs(self): + log.debug("unnamed outputs [%s]" % self.tool_provided_job_metadata) + return self.tool_provided_job_metadata.get("__unnamed_outputs", []) -def collect_dynamic_collections( + +def collect_dynamic_outputs( tool, output_collections, tool_provided_metadata, @@ -140,6 +155,7 @@ def collect_dynamic_collections( job=None, input_dbkey="?", ): + app = tool.app collections_service = tool.app.dataset_collections_service job_context = JobContext( tool, @@ -149,6 +165,88 @@ def collect_dynamic_collections( inp_data, input_dbkey, ) + log.info(tool_provided_metadata) + for unnamed_output_dict in tool_provided_metadata.get_unnamed_outputs(): + assert "destination" in unnamed_output_dict + assert "elements" in unnamed_output_dict + destination = unnamed_output_dict["destination"] + elements = unnamed_output_dict["elements"] + + assert "type" in destination + destination_type = destination["type"] + trans = job_context.work_context + + if destination_type == "library_folder": + + library_folder_manager = app.library_folder_manager + library_folder = library_folder_manager.get(trans, app.security.decode_id(destination.get("library_folder_id"))) + + def add_elements_to_folder(elements, library_folder): + for element in elements: + if "elements" in element: + assert "name" in element + name = element["name"] + description = element.get("description") + nested_folder = library_folder_manager.create(trans, library_folder.id, name, description) + add_elements_to_folder(element["elements"], nested_folder) + else: + discovered_file = discovered_file_for_unnamed_output(element, job_working_directory) + fields_match = discovered_file.match + designation = fields_match.designation + visible = fields_match.visible + ext = fields_match.ext + dbkey = fields_match.dbkey + info = element.get("info", None) + link_data = discovered_file.match.link_data + + # Create new primary dataset + name = fields_match.name or designation + + job_context.create_dataset( + ext=ext, + designation=designation, + visible=visible, + dbkey=dbkey, + name=name, + filename=discovered_file.path, + info=info, + library_folder=library_folder, + link_data=link_data + ) + + add_elements_to_folder(elements, library_folder) + elif destination_type == "hdca": + history = job.history + assert "collection_type" in unnamed_output_dict + name = unnamed_output_dict.get("name", "unnamed collection") + collection_type = unnamed_output_dict["collection_type"] + collection_type_description = collections_service.collection_type_descriptions.for_collection_type(collection_type) + structure = UnitializedTree(collection_type_description) + hdca = collections_service.precreate_dataset_collection_instance( + trans, history, name, structure=structure + ) + filenames = odict.odict() + + def add_to_discovered_files(elements, parent_identifiers=[]): + for element in elements: + if "elements" in element: + add_to_discovered_files(element["elements"], parent_identifiers + [element["name"]]) + else: + discovered_file = discovered_file_for_unnamed_output(element, job_working_directory, parent_identifiers) + filenames[discovered_file.path] = discovered_file + + add_to_discovered_files(elements) + + collection = hdca.collection + collection_builder = collections_service.collection_builder_for( + collection + ) + job_context.populate_collection_elements( + collection, + collection_builder, + filenames, + ) + collection_builder.populate() for name, has_collection in output_collections.items(): if name not in tool.output_collections: @@ -165,13 +263,19 @@ def collect_dynamic_collections( collection = has_collection try: + collection_builder = collections_service.collection_builder_for( collection ) + dataset_collectors = map(dataset_collector, output_collection_def.dataset_collector_descriptions) + output_name = output_collection_def.name + filenames = job_context.find_files(output_name, collection, dataset_collectors) job_context.populate_collection_elements( collection, collection_builder, - output_collection_def, + filenames, + name=output_collection_def.name, + metadata_source_name=output_collection_def.metadata_source, ) collection_builder.populate() except Exception: @@ -190,6 +294,11 @@ def __init__(self, tool, tool_provided_metadata, job, job_working_directory, inp self.job_working_directory = job_working_directory self.tool_provided_metadata = tool_provided_metadata + @property + def work_context(self): + from galaxy.work.context import WorkRequestContext + return WorkRequestContext(self.app, user=self.job.user) + @property def permissions(self): inp_data = self.inp_data @@ -207,15 +316,14 @@ def find_files(self, output_name, collection, dataset_collectors): filenames[discovered_file.path] = discovered_file return filenames - def populate_collection_elements(self, collection, root_collection_builder, output_collection_def): + def populate_collection_elements(self, collection, root_collection_builder, filenames, name=None, metadata_source_name=None): # TODO: allow configurable sorting. # # # # - dataset_collectors = map(dataset_collector, output_collection_def.dataset_collector_descriptions) - output_name = output_collection_def.name - filenames = self.find_files(output_name, collection, dataset_collectors) + if name is None: + name = "unnamed output" element_datasets = [] for filename, discovered_file in filenames.items(): @@ -234,6 +342,8 @@ def populate_collection_elements(self, collection, root_collection_builder, outp # Create new primary dataset name = fields_match.name or designation + link_data = discovered_file.match.link_data + dataset = self.create_dataset( ext=ext, designation=designation, @@ -241,14 +351,15 @@ def populate_collection_elements(self, collection, root_collection_builder, outp dbkey=dbkey, name=name, filename=filename, - metadata_source_name=output_collection_def.metadata_source, + metadata_source_name=metadata_source_name, + link_data=link_data, ) log.debug( "(%s) Created dynamic collection dataset for path [%s] with element identifier [%s] for output [%s] %s", self.job.id, filename, designation, - output_collection_def.name, + name, create_dataset_timer, ) element_datasets.append((element_identifiers, dataset)) @@ -263,7 +374,7 @@ def populate_collection_elements(self, collection, root_collection_builder, outp log.debug( "(%s) Add dynamic collection datsets to history for output [%s] %s", self.job.id, - output_collection_def.name, + name, add_datasets_timer, ) @@ -293,12 +404,18 @@ def create_dataset( dbkey, name, filename, - metadata_source_name, + metadata_source_name=None, + info=None, + library_folder=None, + link_data=False, ): app = self.app sa_session = self.sa_session - primary_data = _new_hda(app, sa_session, ext, designation, visible, dbkey, self.permissions) + if not library_folder: + primary_data = _new_hda(app, sa_session, ext, designation, visible, dbkey, self.permissions) + else: + primary_data = _new_ldda(self.work_context, name, ext, visible, dbkey, library_folder) # Copy metadata from one of the inputs if requested. metadata_source = None @@ -307,7 +424,11 @@ def create_dataset( sa_session.flush() # Move data from temp location to dataset location - app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) + if not link_data: + app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) + else: + primary_data.link_to(filename) + primary_data.set_size() # If match specified a name use otherwise generate one from # designation. @@ -318,6 +439,9 @@ def create_dataset( else: primary_data.init_meta() + if info is not None: + primary_data.info = info + primary_data.set_meta() primary_data.set_peek() @@ -484,6 +608,20 @@ def discover_files(output_name, tool_provided_metadata, extra_file_collectors, j yield DiscoveredFile(match.path, collector, match) +def discovered_file_for_unnamed_output(dataset, job_working_directory, parent_identifiers=[]): + extra_file_collector = DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR + target_directory = discover_target_directory(extra_file_collector, job_working_directory) + filename = dataset["filename"] + # handle link_data_only here, verify filename is in directory if not linking... + if not dataset.get("link_data_only"): + path = os.path.join(target_directory, filename) + if not util.in_directory(target_directory, path): + raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.") + else: + path = filename + return DiscoveredFile(path, extra_file_collector, JsonCollectedDatasetMatch(dataset, extra_file_collector, filename, path=path, parent_identifiers=parent_identifiers)) + + def discover_target_directory(extra_file_collector, job_working_directory): directory = job_working_directory if extra_file_collector.directory: @@ -585,11 +723,12 @@ def _compose(f, g): class JsonCollectedDatasetMatch(object): - def __init__(self, as_dict, collector, filename, path=None): + def __init__(self, as_dict, collector, filename, path=None, parent_identifiers=[]): self.as_dict = as_dict self.collector = collector self.filename = filename self.path = path + self._parent_identifiers = parent_identifiers @property def designation(self): @@ -607,7 +746,7 @@ def designation(self): @property def element_identifiers(self): - return self.raw_element_identifiers or [self.designation] + return self._parent_identifiers + (self.raw_element_identifiers or [self.designation]) @property def raw_element_identifiers(self): @@ -644,6 +783,10 @@ def visible(self): except KeyError: return self.collector.default_visible + @property + def link_data(self): + return bool(self.as_dict.get("link_data_only", False)) + class RegexCollectedDatasetMatch(JsonCollectedDatasetMatch): @@ -656,6 +799,42 @@ def __init__(self, re_match, collector, filename, path=None): UNSET = object() +def _new_ldda( + trans, + name, + ext, + visible, + dbkey, + library_folder, +): + ld = trans.app.model.LibraryDataset(folder=library_folder, name=name) + trans.sa_session.add(ld) + trans.sa_session.flush() + trans.app.security_agent.copy_library_permissions(trans, library_folder, ld) + + ldda = trans.app.model.LibraryDatasetDatasetAssociation(name=name, + extension=ext, + dbkey=dbkey, + library_dataset=ld, + user=trans.user, + create_dataset=True, + sa_session=trans.sa_session) + trans.sa_session.add(ldda) + ldda.state = ldda.states.OK + # Permissions must be the same on the LibraryDatasetDatasetAssociation and the associated LibraryDataset + trans.app.security_agent.copy_library_permissions(trans, ld, ldda) + # Copy the current user's DefaultUserPermissions to the new LibraryDatasetDatasetAssociation.dataset + trans.app.security_agent.set_all_dataset_permissions(ldda.dataset, trans.app.security_agent.user_get_default_permissions(trans.user)) + library_folder.add_library_dataset(ld, genome_build=dbkey) + trans.sa_session.add(library_folder) + trans.sa_session.flush() + + ld.library_dataset_dataset_association_id = ldda.id + trans.sa_session.add(ld) + trans.sa_session.flush() + return ldda + + def _new_hda( app, sa_session, @@ -682,3 +861,4 @@ def _new_hda( DEFAULT_DATASET_COLLECTOR = DatasetCollector(DEFAULT_DATASET_COLLECTOR_DESCRIPTION) +DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR = ToolMetadataDatasetCollector(ToolProvidedMetadataDatasetCollection()) diff --git a/lib/galaxy/tools/special_tools.py b/lib/galaxy/tools/special_tools.py index 953e69dee647..129b7064a941 100644 --- a/lib/galaxy/tools/special_tools.py +++ b/lib/galaxy/tools/special_tools.py @@ -4,6 +4,7 @@ SPECIAL_TOOLS = { "history export": "galaxy/tools/imp_exp/exp_history_to_archive.xml", "history import": "galaxy/tools/imp_exp/imp_history_from_archive.xml", + "data fetch": "galaxy/tools/data_fetch.xml", } diff --git a/lib/galaxy/webapps/galaxy/api/_fetch_util.py b/lib/galaxy/webapps/galaxy/api/_fetch_util.py new file mode 100644 index 000000000000..6630dfd79a93 --- /dev/null +++ b/lib/galaxy/webapps/galaxy/api/_fetch_util.py @@ -0,0 +1,205 @@ +import logging +import os + +from galaxy.actions.library import ( + validate_path_upload, + validate_server_directory_upload, +) +from galaxy.exceptions import ( + RequestParameterInvalidException +) +from galaxy.tools.actions.upload_common import validate_url +from galaxy.util import ( + relpath, +) + +log = logging.getLogger(__name__) + +VALID_DESTINATION_TYPES = ["library", "library_folder", "hdca"] +ELEMENTS_FROM_TYPE = ["archive", "bagit", "bagit_archive", "directory"] +# These elements_from cannot be sym linked to because they only exist during upload. +ELEMENTS_FROM_TRANSIENT_TYPES = ["archive", "bagit_archive"] + + +def validate_and_normalize_targets(trans, payload): + """Validate and normalize all src references in fetch targets. + + - Normalize ftp_import and server_dir src entries into simple path entires + with the relevant paths resolved and permissions / configuration checked. + - Check for file:// URLs in items src of "url" and convert them into path + src items - after verifying path pastes are allowed and user is admin. + - Check for valid URLs to be fetched for http and https entries. + - Based on Galaxy configuration and upload types set purge_source and in_place + as needed for each upload. + """ + targets = payload.get("targets", []) + + for target in targets: + destination = _get_required_item(target, "destination", "Each target must specify a 'destination'") + destination_type = _get_required_item(destination, "type", "Each target destination must specify a 'type'") + if destination_type not in VALID_DESTINATION_TYPES: + template = "Invalid target destination type [%s] encountered, must be one of %s" + msg = template % (destination_type, VALID_DESTINATION_TYPES) + raise RequestParameterInvalidException(msg) + if destination_type == "library": + library_name = _get_required_item(destination, "name", "Must specify a library name") + description = destination.get("description", "") + synopsis = destination.get("synopsis", "") + library = trans.app.library_manager.create( + trans, library_name, description=description, synopsis=synopsis + ) + destination["type"] = "library_folder" + for key in ["name", "description", "synopsis"]: + if key in destination: + del destination[key] + destination["library_folder_id"] = trans.app.security.encode_id(library.root_folder.id) + + # Unlike upload.py we don't transmit or use run_as_real_user in the job - we just make sure + # in_place and purge_source are set on the individual upload fetch sources as needed based + # on this. + run_as_real_user = trans.app.config.external_chown_script is None # See comment in upload.py + purge_ftp_source = getattr(trans.app.config, 'ftp_upload_purge', True) and not run_as_real_user + + payload["check_content"] = trans.app.config.check_upload_content + + def check_src(item): + # Normalize file:// URLs into paths. + if item["src"] == "url" and item["url"].startswith("file://"): + item["src"] = "path" + item["path"] = item["url"][len("file://"):] + del item["path"] + + if "in_place" in item: + raise RequestParameterInvalidException("in_place cannot be set in the upload request") + + src = item["src"] + + # Check link_data_only can only be set for certain src types and certain elements_from types. + _handle_invalid_link_data_only_elements_type(item) + if src not in ["path", "server_dir"]: + _handle_invalid_link_data_only_type(item) + elements_from = item.get("elements_from", None) + if elements_from and elements_from not in ELEMENTS_FROM_TYPE: + raise RequestParameterInvalidException("Invalid elements_from/items_from found in request") + + if src == "path" or (src == "url" and item["url"].startswith("file:")): + # Validate is admin, leave alone. + validate_path_upload(trans) + elif src == "server_dir": + # Validate and replace with path definition. + server_dir = item["server_dir"] + full_path, _ = validate_server_directory_upload(trans, server_dir) + item["src"] = "path" + item["path"] = full_path + elif src == "ftp_import": + ftp_path = item["ftp_path"] + full_path = None + + # It'd be nice if this can be de-duplicated with what is in parameters/grouping.py. + user_ftp_dir = trans.user_ftp_dir + is_directory = False + + assert not os.path.islink(user_ftp_dir), "User FTP directory cannot be a symbolic link" + for (dirpath, dirnames, filenames) in os.walk(user_ftp_dir): + for filename in filenames: + if ftp_path == filename: + path = relpath(os.path.join(dirpath, filename), user_ftp_dir) + if not os.path.islink(os.path.join(dirpath, filename)): + full_path = os.path.abspath(os.path.join(user_ftp_dir, path)) + break + + for dirname in dirnames: + if ftp_path == dirname: + path = relpath(os.path.join(dirpath, dirname), user_ftp_dir) + if not os.path.islink(os.path.join(dirpath, dirname)): + full_path = os.path.abspath(os.path.join(user_ftp_dir, path)) + is_directory = True + break + + if is_directory: + # If the target is a directory - make sure no files under it are symbolic links + for (dirpath, dirnames, filenames) in os.walk(full_path): + for filename in filenames: + if ftp_path == filename: + path = relpath(os.path.join(dirpath, filename), full_path) + if not os.path.islink(os.path.join(dirpath, filename)): + full_path = False + break + + for dirname in dirnames: + if ftp_path == dirname: + path = relpath(os.path.join(dirpath, filename), full_path) + if not os.path.islink(os.path.join(dirpath, filename)): + full_path = False + break + + if not full_path: + raise RequestParameterInvalidException("Failed to find referenced ftp_path or symbolic link was enountered") + + item["src"] = "path" + item["path"] = full_path + item["purge_source"] = purge_ftp_source + elif src == "url": + url = item["url"] + looks_like_url = False + for url_prefix in ["http://", "https://", "ftp://", "ftps://"]: + if url.startswith(url_prefix): + looks_like_url = True + break + + if not looks_like_url: + raise RequestParameterInvalidException("Invalid URL [%s] found in src definition." % url) + + validate_url(url, trans.app.config.fetch_url_whitelist_ips) + item["in_place"] = run_as_real_user + elif src == "files": + item["in_place"] = run_as_real_user + + _replace_request_syntax_sugar(targets) + _for_each_src(check_src, targets) + + +def _replace_request_syntax_sugar(obj): + # For data libraries and hdas to make sense - allow items and items_from in place of elements + # and elements_from. This is destructive and modifies the supplied request. + if isinstance(obj, list): + for el in obj: + _replace_request_syntax_sugar(el) + elif isinstance(obj, dict): + if "items" in obj: + obj["elements"] = obj["items"] + del obj["items"] + if "items_from" in obj: + obj["elements_from"] = obj["items_from"] + del obj["items_from"] + for value in obj.values(): + _replace_request_syntax_sugar(value) + + +def _handle_invalid_link_data_only_type(item): + link_data_only = item.get("link_data_only", False) + if link_data_only: + raise RequestParameterInvalidException("link_data_only is invalid for src type [%s]" % item.get("src")) + + +def _handle_invalid_link_data_only_elements_type(item): + link_data_only = item.get("link_data_only", False) + if link_data_only and item.get("elements_from", False) in ELEMENTS_FROM_TRANSIENT_TYPES: + raise RequestParameterInvalidException("link_data_only is invalid for derived elements from [%s]" % item.get("elements_from")) + + +def _get_required_item(from_dict, key, message): + if key not in from_dict: + raise RequestParameterInvalidException(message) + return from_dict[key] + + +def _for_each_src(f, obj): + if isinstance(obj, list): + for item in obj: + _for_each_src(f, item) + if isinstance(obj, dict): + if "src" in obj: + f(obj) + for key, value in obj.items(): + _for_each_src(f, value) diff --git a/lib/galaxy/webapps/galaxy/api/tools.py b/lib/galaxy/webapps/galaxy/api/tools.py index 412cc4082d89..107a090b03e8 100644 --- a/lib/galaxy/webapps/galaxy/api/tools.py +++ b/lib/galaxy/webapps/galaxy/api/tools.py @@ -12,9 +12,14 @@ from galaxy.web import _future_expose_api_anonymous_and_sessionless as expose_api_anonymous_and_sessionless from galaxy.web.base.controller import BaseAPIController from galaxy.web.base.controller import UsesVisualizationMixin +from ._fetch_util import validate_and_normalize_targets log = logging.getLogger(__name__) +# Do not allow these tools to be called directly - they (it) enforces extra security and +# provides access via a different API endpoint. +PROTECTED_TOOLS = ["__DATA_FETCH__"] + class ToolsController(BaseAPIController, UsesVisualizationMixin): """ @@ -290,12 +295,52 @@ def download(self, trans, id, **kwds): trans.response.headers["Content-Disposition"] = 'attachment; filename="%s.tgz"' % (id) return download_file + @expose_api_anonymous + def fetch(self, trans, payload, **kwd): + """Adapt clean API to tool-constrained API. + """ + log.info("Keywords are %s" % payload) + request_version = '1' + history_id = payload.pop("history_id") + clean_payload = {} + files_payload = {} + for key, value in payload.items(): + if key == "key": + continue + if key.startswith('files_') or key.startswith('__files_'): + files_payload[key] = value + continue + clean_payload[key] = value + log.info("payload %s" % clean_payload) + validate_and_normalize_targets(trans, clean_payload) + clean_payload["check_content"] = trans.app.config.check_upload_content + request = dumps(clean_payload) + log.info(request) + create_payload = { + 'tool_id': "__DATA_FETCH__", + 'history_id': history_id, + 'inputs': { + 'request_version': request_version, + 'request_json': request, + }, + } + create_payload.update(files_payload) + return self._create(trans, create_payload, **kwd) + @expose_api_anonymous def create(self, trans, payload, **kwd): """ POST /api/tools Executes tool using specified inputs and returns tool's outputs. """ + tool_id = payload.get("tool_id") + if tool_id in PROTECTED_TOOLS: + raise exceptions.RequestParameterInvalidException("Cannot execute tool [%s] directly, must use alternative endpoint." % tool_id) + if tool_id is None: + raise exceptions.RequestParameterInvalidException("Must specify a valid tool_id to use this endpoint.") + return self._create(trans, payload, **kwd) + + def _create(self, trans, payload, **kwd): # HACK: for now, if action is rerun, rerun tool. action = payload.get('action', None) if action == 'rerun': diff --git a/lib/galaxy/webapps/galaxy/buildapp.py b/lib/galaxy/webapps/galaxy/buildapp.py index fa14db1e8063..7a9a60bf91bd 100644 --- a/lib/galaxy/webapps/galaxy/buildapp.py +++ b/lib/galaxy/webapps/galaxy/buildapp.py @@ -268,6 +268,7 @@ def populate_api_routes(webapp, app): # ====== TOOLS API ====== # ======================= + webapp.mapper.connect('/api/tools/fetch', action='fetch', controller='tools', conditions=dict(method=["POST"])) webapp.mapper.connect('/api/tools/all_requirements', action='all_requirements', controller="tools") webapp.mapper.connect('/api/tools/{id:.+?}/build', action='build', controller="tools") webapp.mapper.connect('/api/tools/{id:.+?}/reload', action='reload', controller="tools") diff --git a/scripts/api/fetch_to_library.py b/scripts/api/fetch_to_library.py new file mode 100644 index 000000000000..6c497bcb402b --- /dev/null +++ b/scripts/api/fetch_to_library.py @@ -0,0 +1,33 @@ +import argparse +import json + +import requests +import yaml + + +def main(): + parser = argparse.ArgumentParser(description='Upload a directory into a data library') + parser.add_argument("-u", "--url", dest="url", required=True, help="Galaxy URL") + parser.add_argument("-a", "--api", dest="api_key", required=True, help="API Key") + parser.add_argument('target', metavar='FILE', type=str, + help='file describing data library to fetch') + args = parser.parse_args() + with open(args.target, "r") as f: + target = yaml.load(f) + + histories_url = args.url + "/api/histories" + new_history_response = requests.post(histories_url, data={'key': args.api_key}) + + fetch_url = args.url + '/api/tools/fetch' + payload = { + 'key': args.api_key, + 'targets': json.dumps([target]), + 'history_id': new_history_response.json()["id"] + } + + response = requests.post(fetch_url, data=payload) + print(response.content) + + +if __name__ == '__main__': + main() diff --git a/scripts/api/fetch_to_library_example.yml b/scripts/api/fetch_to_library_example.yml new file mode 100644 index 000000000000..44bc35ef43b5 --- /dev/null +++ b/scripts/api/fetch_to_library_example.yml @@ -0,0 +1,42 @@ +destination: + type: library + name: Training Material + description: Data for selected tutorials from https://training.galaxyproject.org. +items: + - name: Quality Control + description: | + Data for sequence quality control tutorial at http://galaxyproject.github.io/training-material/topics/sequence-analysis/tutorials/quality-control/tutorial.html. + + 10.5281/zenodo.61771 + items: + - src: url + url: https://zenodo.org/record/61771/files/GSM461178_untreat_paired_subset_1.fastq + name: GSM461178_untreat_paired_subset_1 + ext: fastqsanger + info: Untreated subseq of GSM461178 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/61771/files/GSM461182_untreat_single_subset.fastq + name: GSM461182_untreat_single_subset + ext: fastqsanger + info: Untreated subseq of GSM461182 from 10.1186/s12864-017-3692-8 + - name: Small RNA-Seq + description: | + Data for small RNA-seq tutorial available at http://galaxyproject.github.io/training-material/topics/transcriptomics/tutorials/srna/tutorial.html + + 10.5281/zenodo.826906 + items: + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep1_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep1 + ext: fastqsanger.gz + info: Downsample rep1 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep2_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep2 + ext: fastqsanger.gz + info: Downsample rep2 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep3_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep3 + ext: fastqsanger.gz + info: Downsample rep3 from 10.1186/s12864-017-3692-8 diff --git a/test-data/example-bag.zip b/test-data/example-bag.zip new file mode 100644 index 0000000000000000000000000000000000000000..ee4de52c265be0c5ddfe54ade610957580115dac GIT binary patch literal 2966 zcmWIWW@h1H00G^m9&a!MN{BGXFqEVgm*^%Xrt7AqmLzBBW|Wi^=!b@IGBBsQ8$>Ph zG>9s#;AUWC`Nqh=z#;mYgs!XZ3B?^0f-fFm`Zt`8A z>1ZbBp7rKCq#(fdpw|B&E8D7L;)ZH=W_MR~s?QMXl4xXGwsP;@9}&Ce%>4g9t)Fe( z!ljIh+vZ-Jr?NV+JHQ+@O#^gzJP~5vR?8p#Z|NxI-ed zgitUzC8m3p=!T^h6=&w>St%IkS(v7HxNQv;}(335b>P85!j2=;G^&;>}FV z*p@wglDENthvh)nHl|ez8)f)Dsyn{^+WgMZs?#I zn&~Pev>=Ak0n7%S;31$=noE6!&7uF}kM> z`I-%Q7!J%0|IipDbGz`GmqFP>_ClHPwOS4n7M@Rk`Br?Tl(JE<=9yU=SQvU)v%PeZ zUsRvgy+8e@q3!d9f1}(FO*v@T^Te|`R83Kk;o}LpxAs>|_UHZH`15yk`}v(~@3Nx$ zVE(J~2^~P!uK{9pLOv+YNHj7vBj|}U)_jM6p181g`3qK!JehhUrHd0=r7a4396FR< z9Qc@+Q2F@q&7E0oN;^G*Ro5Ii+I#4r>cmM7iwx3&^Cvbf_?x#kbRbe=f3^ zH!bsb4l~oVyhxKwcAvXJC2rFumE^G;x)7<|xqy>>Wy0xeM`yKrE&OR8w!ZA9ZG3jj z``TT5uOI(_=25AY$@%kuo_vkRqa}&yq{No*S;K8ih8(RA|E*0jTcAE+LgP~g#<@|&pGu@?0@;X;NXNX1&_rZW-snEx@pSYJ(gCrTjqRC?DO=0o)h>IJB$KKEZ)tD z4V<-c@vgep$@4N##7)jVnXu&41_|cU*lOQNy2-!(`gHTgwy(Xq*NMUH+6SeH4=zk9 zu72_PM8)~j$rm(dEGiUV<#_i1%VFkIPRv_x{8PSH?)pmo7g|bHUa&4=cI05WaA)}i);Eug4GLru*t{E`I7t8a5vN&r@9VBz!Dp^6;7HVT zRN3)%sxXWDE53$Xc_us7KL57y<`R*{30H4jQ+@pV^S2lC*FXOL<3`_G=HhPuck^!S zSXHFUvrIKFE#sWw>^Q|MD`qr@Z%|$J(|fmU{;3P9;!ga13gJGIA_1%SOx)#|QlusB z)9?6Z;*{Xs)@6#9mwpetZhrOLs;h1*85Fl(ni6&~=DsX2xd=!u_cA9I`DDK+CYXA7~4#)+1~MX61z(M9*+o0jfu@Sb67|*c+&0%2R0zxKWSkt@^7^NRoTEWf0$nuSm zfq_K?s5k&_ggBZJe8@&1%;5u?LC~B=amj9Opy?nijA2e%W=^Ux*ij(E1TY;{T$%*5 z5TuL?;XV$a`xFeU%-Dctf-s8vKxz%aMt%VL^CJ+W`7=NdcP1$Rx*%D*_~dZUg}VhPRF&8YwzhA<=>6L}cSIBLmquXJq4` zQ3Es+6q*FgM2#4TnZTsLu%xjY!%S#25jGezW{?ejjBGG8o`7b8;t7vqG2;i>OhcfX g;o%7~6Bt`SGeNP1VJ0gmiWyjea0yUXJ;=8V0D^6pPXGV_ literal 0 HcmV?d00001 diff --git a/test/api/test_dataset_collections.py b/test/api/test_dataset_collections.py index ca8950d8bb0c..87752d2b9096 100644 --- a/test/api/test_dataset_collections.py +++ b/test/api/test_dataset_collections.py @@ -188,6 +188,78 @@ def test_enforces_unique_names(self): create_response = self._post("dataset_collections", payload) self._assert_status_code_is(create_response, 400) + def test_upload_collection(self): + elements = [{"src": "files", "dbkey": "hg19", "info": "my cool bed"}] + targets = [{ + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + "name": "Test upload", + }] + payload = { + "history_id": self.history_id, + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + hdca = self._assert_one_collection_created_in_history() + self.assertEquals(hdca["name"], "Test upload") + assert len(hdca["elements"]) == 1, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "4.bed" + assert element0["object"]["file_size"] == 61 + + def test_upload_nested(self): + elements = [{"name": "samp1", "elements": [{"src": "files", "dbkey": "hg19", "info": "my cool bed"}]}] + targets = [{ + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list:list", + "name": "Test upload", + }] + payload = { + "history_id": self.history_id, + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + hdca = self._assert_one_collection_created_in_history() + self.assertEquals(hdca["name"], "Test upload") + assert len(hdca["elements"]) == 1, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "samp1" + + def test_upload_collection_from_url(self): + elements = [{"src": "url", "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed", "info": "my cool bed"}] + targets = [{ + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + }] + payload = { + "history_id": self.history_id, + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + hdca = self._assert_one_collection_created_in_history() + assert len(hdca["elements"]) == 1, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "4.bed" + assert element0["object"]["file_size"] == 61 + + def _assert_one_collection_created_in_history(self): + contents_response = self._get("histories/%s/contents/dataset_collections" % self.history_id) + self._assert_status_code_is(contents_response, 200) + contents = contents_response.json() + assert len(contents) == 1 + hdca = contents[0] + assert hdca["history_content_type"] == "dataset_collection" + hdca_id = hdca["id"] + collection_response = self._get("histories/%s/contents/dataset_collections/%s" % (self.history_id, hdca_id)) + self._assert_status_code_is(collection_response, 200) + return collection_response.json() + def _check_create_response(self, create_response): self._assert_status_code_is(create_response, 200) dataset_collection = create_response.json() diff --git a/test/api/test_libraries.py b/test/api/test_libraries.py index 2a715f50fcbb..ae0303a8074f 100644 --- a/test/api/test_libraries.py +++ b/test/api/test_libraries.py @@ -1,3 +1,5 @@ +import json + from base import api from base.populators import ( DatasetCollectionPopulator, @@ -95,6 +97,94 @@ def test_create_dataset(self): assert library_dataset["peek"].find("create_test") >= 0 assert library_dataset["file_ext"] == "txt", library_dataset["file_ext"] + def test_fetch_upload_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("flat_zip") + items = [{"src": "files", "dbkey": "hg19", "info": "my cool bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + assert dataset["genome_build"] == "hg19", dataset + assert dataset["misc_info"] == "my cool bed", dataset + assert dataset["file_ext"] == "bed", dataset + + def test_fetch_zip_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("flat_zip") + bed_test_data_path = self.test_data_resolver.get_filename("4.bed.zip") + targets = [{ + "destination": destination, + "items_from": "archive", "src": "files", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(bed_test_data_path)} + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_single_url_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("single_url") + items = [{"src": "url", "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_url_archive_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("single_url") + targets = [{ + "destination": destination, + "items_from": "archive", + "src": "url", + "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed.zip", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_bagit_archive_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("bagit_archive") + example_bag_path = self.test_data_resolver.get_filename("example-bag.zip") + targets = [{ + "destination": destination, + "items_from": "bagit_archive", "src": "files", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(example_bag_path)}, + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/README.txt") + assert dataset["file_size"] == 66, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/bdbag-profile.json") + assert dataset["file_size"] == 723, dataset + + def _setup_fetch_to_folder(self, test_name): + return self.library_populator.setup_fetch_to_folder(test_name) + def test_create_dataset_in_folder(self): library = self.library_populator.new_private_library("ForCreateDatasets") folder_response = self._create_folder(library) diff --git a/test/base/integration_util.py b/test/base/integration_util.py index 4339436a9b4a..8910f9b82996 100644 --- a/test/base/integration_util.py +++ b/test/base/integration_util.py @@ -7,6 +7,7 @@ import os from unittest import skip, TestCase +from galaxy.tools.verify.test_data import TestDataResolver from .api import UsesApiTestCaseMixin from .driver_util import GalaxyTestDriver @@ -45,6 +46,7 @@ def tearDownClass(cls): cls._app_available = False def setUp(self): + self.test_data_resolver = TestDataResolver() # Setup attributes needed for API testing... server_wrapper = self._test_driver.server_wrappers[0] host = server_wrapper.host diff --git a/test/base/populators.py b/test/base/populators.py index c8388d06bcc2..9da1baa12628 100644 --- a/test/base/populators.py +++ b/test/base/populators.py @@ -148,14 +148,30 @@ def new_dataset_request(self, history_id, content=None, wait=False, **kwds): self.wait_for_tool_run(history_id, run_response, assert_ok=kwds.get('assert_ok', True)) return run_response + def fetch(self, payload, assert_ok=True, timeout=DEFAULT_TIMEOUT): + tool_response = self._post("tools/fetch", data=payload) + if assert_ok: + job = self.check_run(tool_response) + self.wait_for_job(job["id"], timeout=timeout) + + job = tool_response.json()["jobs"][0] + details = self.get_job_details(job["id"]).json() + assert details["state"] == "ok", details + + return tool_response + def wait_for_tool_run(self, history_id, run_response, timeout=DEFAULT_TIMEOUT, assert_ok=True): - run = run_response.json() - assert run_response.status_code == 200, run - job = run["jobs"][0] + job = self.check_run(run_response) self.wait_for_job(job["id"], timeout=timeout) self.wait_for_history(history_id, assert_ok=assert_ok, timeout=timeout) return run_response + def check_run(self, run_response): + run = run_response.json() + assert run_response.status_code == 200, run + job = run["jobs"][0] + return job + def wait_for_history(self, history_id, assert_ok=False, timeout=DEFAULT_TIMEOUT): try: return wait_on_state(lambda: self._get("histories/%s" % history_id), assert_ok=assert_ok, timeout=timeout) @@ -266,8 +282,8 @@ def run_tool(self, tool_id, inputs, history_id, assert_ok=True, **kwds): else: return tool_response - def tools_post(self, payload): - tool_response = self._post("tools", data=payload) + def tools_post(self, payload, url="tools"): + tool_response = self._post(url, data=payload) return tool_response def get_history_dataset_content(self, history_id, wait=True, filename=None, **kwds): @@ -463,6 +479,11 @@ class LibraryPopulator(object): def __init__(self, galaxy_interactor): self.galaxy_interactor = galaxy_interactor + self.dataset_populator = DatasetPopulator(galaxy_interactor) + + def get_libraries(self): + get_response = self.galaxy_interactor.get("libraries") + return get_response.json() def new_private_library(self, name): library = self.new_library(name) @@ -563,6 +584,24 @@ def show(): return library, library_dataset + def get_library_contents_with_path(self, library_id, path): + all_contents_response = self.galaxy_interactor.get("libraries/%s/contents" % library_id) + api_asserts.assert_status_code_is(all_contents_response, 200) + all_contents = all_contents_response.json() + matching = [c for c in all_contents if c["name"] == path] + if len(matching) == 0: + raise Exception("Failed to find library contents with path [%s], contents are %s" % (path, all_contents)) + get_response = self.galaxy_interactor.get(matching[0]["url"]) + api_asserts.assert_status_code_is(get_response, 200) + return get_response.json() + + def setup_fetch_to_folder(self, test_name): + history_id = self.dataset_populator.new_history() + library = self.new_private_library(test_name) + folder_id = library["root_folder_id"][1:] + destination = {"type": "library_folder", "library_folder_id": folder_id} + return history_id, library, destination + class BaseDatasetCollectionPopulator(object): diff --git a/test/integration/test_upload_configuration_options.py b/test/integration/test_upload_configuration_options.py index d5f6789723ca..cdd97bdf8f11 100644 --- a/test/integration/test_upload_configuration_options.py +++ b/test/integration/test_upload_configuration_options.py @@ -19,6 +19,7 @@ framework but tested here for FTP uploads. """ +import json import os import re import shutil @@ -53,6 +54,42 @@ def setUp(self): self.library_populator = LibraryPopulator(self.galaxy_interactor) self.history_id = self.dataset_populator.new_history() + def fetch_target(self, target, assert_ok=False, attach_test_file=False): + payload = { + "history_id": self.history_id, + "targets": json.dumps([target]), + } + if attach_test_file: + payload["__files"] = {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))} + + response = self.dataset_populator.fetch(payload, assert_ok=assert_ok) + return response + + +class InvalidFetchRequestsTestCase(BaseUploadContentConfigurationTestCase): + + def test_in_place_not_allowed(self): + elements = [{"src": "files", "in_place": False}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target, attach_test_file=True) + self._assert_status_code_is(response, 400) + assert 'in_place' in response.json()["err_msg"] + + def test_files_not_attached(self): + elements = [{"src": "files"}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 400) + assert 'Failed to find uploaded file matching target' in response.json()["err_msg"] + class NonAdminsCannotPasteFilePathTestCase(BaseUploadContentConfigurationTestCase): @@ -93,6 +130,26 @@ def test_disallowed_for_libraries(self): response = self.library_populator.raw_library_contents_create(library["id"], payload, files=files) assert response.status_code == 403, response.json() + def test_disallowed_for_fetch(self): + elements = [{"src": "path", "path": "%s/1.txt" % TEST_DATA_DIRECTORY}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 403) + + def test_disallowed_for_fetch_urls(self): + elements = [{"src": "url", "url": "file://%s/1.txt" % TEST_DATA_DIRECTORY}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 403) + class AdminsCanPasteFilePathsTestCase(BaseUploadContentConfigurationTestCase): @@ -117,6 +174,26 @@ def test_admin_path_paste_libraries(self): # Was 403 for non-admin above. assert response.status_code == 200 + def test_admin_fetch(self): + elements = [{"src": "path", "path": "%s/1.txt" % TEST_DATA_DIRECTORY}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + + def test_admin_fetch_file_url(self): + elements = [{"src": "url", "url": "file://%s/1.txt" % TEST_DATA_DIRECTORY}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + class DefaultBinaryContentFiltersTestCase(BaseUploadContentConfigurationTestCase): @@ -211,6 +288,16 @@ def test_blocked_url_for_composite_file(self): # the newer API decorator that handles those details. assert create_response.status_code >= 400 + def test_blocked_url_for_fetch(self): + elements = [{"src": "url", "url": "http://localhost"}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 403) + class BaseFtpUploadConfigurationTestCase(BaseUploadContentConfigurationTestCase): @@ -246,6 +333,9 @@ def _ensure_directory(self, path): if not os.path.exists(path): os.makedirs(path) + def _get_user_ftp_path(self): + return os.path.join(self.ftp_dir(), TEST_USER) + class SimpleFtpUploadConfigurationTestCase(BaseFtpUploadConfigurationTestCase): @@ -265,8 +355,24 @@ def test_ftp_upload(self): # ... but it isn't - is this a bug? Are only certain kinds of uploads purged? # assert not os.path.exists(ftp_path) - def _get_user_ftp_path(self): - return os.path.join(self.ftp_dir(), TEST_USER) + def test_ftp_fetch(self): + content = "hello world\n" + dir_path = self._get_user_ftp_path() + ftp_path = self._write_ftp_file(dir_path, content) + ftp_files = self.dataset_populator.get_remote_files() + assert len(ftp_files) == 1, ftp_files + assert ftp_files[0]["path"] == "test" + assert os.path.exists(ftp_path) + elements = [{"src": "ftp_import", "ftp_path": ftp_files[0]["path"]}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + dataset = self.dataset_populator.get_history_dataset_details(self.history_id, hid=2) + self._check_content(dataset, content) class ExplicitEmailAsIdentifierFtpUploadConfigurationTestCase(SimpleFtpUploadConfigurationTestCase): @@ -319,6 +425,50 @@ def test_ftp_uploads_not_purged(self): assert os.path.exists(ftp_path) +class AdvancedFtpUploadFetchTestCase(BaseFtpUploadConfigurationTestCase): + + def test_fetch_ftp_directory(self): + dir_path = self._get_user_ftp_path() + self._write_ftp_file(os.path.join(dir_path, "subdir"), "content 1", filename="1") + self._write_ftp_file(os.path.join(dir_path, "subdir"), "content 22", filename="2") + self._write_ftp_file(os.path.join(dir_path, "subdir"), "content 333", filename="3") + target = { + "destination": {"type": "hdca"}, + "elements_from": "directory", + "src": "ftp_import", + "ftp_path": "subdir", + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + hdca = self.dataset_populator.get_history_collection_details(self.history_id, hid=1) + assert len(hdca["elements"]) == 3, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "1" + assert element0["object"]["file_size"] == 9 + + def test_fetch_nested_elements_from(self): + dir_path = self._get_user_ftp_path() + self._write_ftp_file(os.path.join(dir_path, "subdir1"), "content 1", filename="1") + self._write_ftp_file(os.path.join(dir_path, "subdir1"), "content 22", filename="2") + self._write_ftp_file(os.path.join(dir_path, "subdir2"), "content 333", filename="3") + elements = [ + {"name": "subdirel1", "src": "ftp_import", "ftp_path": "subdir1", "elements_from": "directory", "collection_type": "list"}, + {"name": "subdirel2", "src": "ftp_import", "ftp_path": "subdir2", "elements_from": "directory", "collection_type": "list"}, + ] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list:list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + hdca = self.dataset_populator.get_history_collection_details(self.history_id, hid=1) + assert len(hdca["elements"]) == 2, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "subdirel1" + + class UploadOptionsFtpUploadConfigurationTestCase(BaseFtpUploadConfigurationTestCase): def test_upload_api_option_space_to_tab(self): @@ -447,6 +597,8 @@ def test_server_dir_uploads_403_if_dir_not_set(self): class ServerDirectoryValidUsageTestCase(BaseUploadContentConfigurationTestCase): + # This tests the library contents API - I think equivalent functionality is available via library datasets API + # and should also be tested. require_admin_user = True @@ -470,3 +622,84 @@ def test_library_import_dir_not_available_to_non_admins(self): payload, files = self.library_populator.create_dataset_request(library, upload_option="upload_directory", server_dir="library") response = self.library_populator.raw_library_contents_create(library["id"], payload, files=files) assert response.status_code == 403, response.json() + + +class FetchByPathTestCase(BaseUploadContentConfigurationTestCase): + + require_admin_user = True + + @classmethod + def handle_galaxy_config_kwds(cls, config): + config["allow_path_paste"] = True + + def test_fetch_path_to_folder(self): + history_id, library, destination = self.library_populator.setup_fetch_to_folder("simple_fetch") + bed_test_data_path = self.test_data_resolver.get_filename("4.bed") + items = [{"src": "path", "path": bed_test_data_path, "info": "my cool bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_link_data_only(self): + history_id, library, destination = self.library_populator.setup_fetch_to_folder("fetch_and_link") + bed_test_data_path = self.test_data_resolver.get_filename("4.bed") + items = [{"src": "path", "path": bed_test_data_path, "info": "my cool bed", "link_data_only": True}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + assert dataset["file_name"] == bed_test_data_path, dataset + + def test_fetch_recursive_archive(self): + history_id, library, destination = self.library_populator.setup_fetch_to_folder("recursive_archive") + bed_test_data_path = self.test_data_resolver.get_filename("testdir1.zip") + targets = [{ + "destination": destination, + "items_from": "archive", "src": "path", "path": bed_test_data_path, + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file1") + assert dataset["file_size"] == 6, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file2") + assert dataset["file_size"] == 6, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/dir1/file3") + assert dataset["file_size"] == 11, dataset + + def test_fetch_recursive_archive_to_library(self): + bed_test_data_path = self.test_data_resolver.get_filename("testdir1.zip") + targets = [{ + "destination": {"type": "library", "name": "My Cool Library"}, + "items_from": "archive", "src": "path", "path": bed_test_data_path, + }] + payload = { + "history_id": self.history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + libraries = self.library_populator.get_libraries() + matching = [l for l in libraries if l["name"] == "My Cool Library"] + assert len(matching) == 1 + library = matching[0] + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file1") + assert dataset["file_size"] == 6, dataset diff --git a/tools/data_source/upload.py b/tools/data_source/upload.py index 082c700fd43d..7e6fa0b71592 100644 --- a/tools/data_source/upload.py +++ b/tools/data_source/upload.py @@ -18,8 +18,12 @@ from galaxy import util from galaxy.datatypes import sniff -from galaxy.datatypes.binary import Binary from galaxy.datatypes.registry import Registry +from galaxy.datatypes.upload_util import ( + handle_sniffable_binary_check, + handle_unsniffable_binary_check, + UploadProblemException, +) from galaxy.util.checkers import ( check_binary, check_bz2, @@ -36,12 +40,6 @@ assert sys.version_info[:2] >= (2, 7) -class UploadProblemException(Exception): - - def __init__(self, message): - self.message = message - - def file_err(msg, dataset, json_file): json_file.write(dumps(dict(type='dataset', ext='data', @@ -118,26 +116,21 @@ def add_file(dataset, registry, json_file, output_path): if dataset.type == 'url': try: - page = urlopen(dataset.path) # page will be .close()ed by sniff methods - temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) + dataset.path = sniff.stream_url_to_file(dataset.path) except Exception as e: raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e))) - dataset.path = temp_name + # See if we have an empty file if not os.path.exists(dataset.path): raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path) + if not os.path.getsize(dataset.path) > 0: raise UploadProblemException('The uploaded file is empty') + # Is dataset content supported sniffable binary? is_binary = check_binary(dataset.path) if is_binary: - # Sniff the data type - guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order) - # Set data_type only if guessed_ext is a binary datatype - datatype = registry.get_datatype_by_extension(guessed_ext) - if isinstance(datatype, Binary): - data_type = guessed_ext - ext = guessed_ext + data_type, ext = handle_sniffable_binary_check(data_type, ext, dataset.path, registry) if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): @@ -260,18 +253,9 @@ def add_file(dataset, registry, json_file, output_path): dataset.name = uncompressed_name data_type = 'zip' if not data_type: - if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type): - # We have a binary dataset, but it is not Bam, Sff or Pdf - data_type = 'binary' - parts = dataset.name.split(".") - if len(parts) > 1: - ext = parts[-1].strip().lower() - is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext) - if check_content and not is_ext_unsniffable_binary: - raise UploadProblemException('The uploaded binary file contains inappropriate content') - elif is_ext_unsniffable_binary and dataset.file_type != ext: - err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext) - raise UploadProblemException(err_msg) + data_type, ext = handle_unsniffable_binary_check( + data_type, ext, dataset.path, dataset.name, is_binary, dataset.file_type, check_content, registry + ) if not data_type: # We must have a text file if check_content and check_html(dataset.path):