diff --git a/lib/galaxy/actions/library.py b/lib/galaxy/actions/library.py index 06b44cf3355f..dfd6256705a6 100644 --- a/lib/galaxy/actions/library.py +++ b/lib/galaxy/actions/library.py @@ -257,10 +257,7 @@ def _make_library_uploaded_dataset(self, trans, params, name, path, type, librar uploaded_dataset.link_data_only = link_data_only uploaded_dataset.uuid = uuid_str if link_data_only == 'link_to_files': - uploaded_dataset.data.file_name = os.path.abspath(path) - # Since we are not copying the file into Galaxy's managed - # default file location, the dataset should never be purgable. - uploaded_dataset.data.dataset.purgable = False + uploaded_dataset.data.link_to(path) trans.sa_session.add_all((uploaded_dataset.data, uploaded_dataset.data.dataset)) trans.sa_session.flush() return uploaded_dataset diff --git a/lib/galaxy/app.py b/lib/galaxy/app.py index 25de26171058..ce22ccd7f396 100644 --- a/lib/galaxy/app.py +++ b/lib/galaxy/app.py @@ -12,6 +12,8 @@ from galaxy import config, jobs from galaxy.jobs import metrics as job_metrics from galaxy.managers.collections import DatasetCollectionManager +from galaxy.managers.folders import FolderManager +from galaxy.managers.libraries import LibraryManager from galaxy.managers.tags import GalaxyTagManager from galaxy.openid.providers import OpenIDProviders from galaxy.queue_worker import GalaxyQueueWorker @@ -90,6 +92,8 @@ def __init__(self, **kwargs): self.tag_handler = GalaxyTagManager(self.model.context) # Dataset Collection Plugins self.dataset_collections_service = DatasetCollectionManager(self) + self.library_folder_manager = FolderManager() + self.library_manager = LibraryManager() # Tool Data Tables self._configure_tool_data_tables(from_shed_config=False) diff --git a/lib/galaxy/datatypes/sniff.py b/lib/galaxy/datatypes/sniff.py index cf058cf069ac..4f019234e142 100644 --- a/lib/galaxy/datatypes/sniff.py +++ b/lib/galaxy/datatypes/sniff.py @@ -14,6 +14,7 @@ import zipfile from six import text_type +from six.moves.urllib.request import urlopen from galaxy import util from galaxy.util import compression_utils @@ -39,6 +40,12 @@ def get_test_fname(fname): return full_path +def stream_url_to_file(path): + page = urlopen(path) # page will be .close()ed in stream_to_file + temp_name = stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) + return temp_name + + def stream_to_open_named_file(stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict'): """Writes a stream to the provided file descriptor, returns the file name. Closes file descriptor""" # signature and behavor is somewhat odd, due to backwards compatibility, but this can/should be done better diff --git a/lib/galaxy/datatypes/upload_util.py b/lib/galaxy/datatypes/upload_util.py new file mode 100644 index 000000000000..97bb11862ca3 --- /dev/null +++ b/lib/galaxy/datatypes/upload_util.py @@ -0,0 +1,47 @@ +from galaxy.datatypes import sniff +from galaxy.datatypes.binary import Binary + + +class UploadProblemException(Exception): + + def __init__(self, message): + self.message = message + + +def handle_unsniffable_binary_check(data_type, ext, path, name, is_binary, requested_ext, check_content, registry): + """Return modified values of data_type and ext if unsniffable binary encountered. + + Throw UploadProblemException if content problems or extension mismatches occur. + + Precondition: check_binary called returned True. + """ + if is_binary or registry.is_extension_unsniffable_binary(requested_ext): + # We have a binary dataset, but it is not Bam, Sff or Pdf + data_type = 'binary' + parts = name.split(".") + if len(parts) > 1: + ext = parts[-1].strip().lower() + is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext) + if check_content and not is_ext_unsniffable_binary: + raise UploadProblemException('The uploaded binary file contains inappropriate content') + + elif is_ext_unsniffable_binary and requested_ext != ext: + err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext) + raise UploadProblemException(err_msg) + return data_type, ext + + +def handle_sniffable_binary_check(data_type, ext, path, registry): + """Return modified values of data_type and ext if sniffable binary encountered. + + Precondition: check_binary called returned True. + """ + # Sniff the data type + guessed_ext = sniff.guess_ext(path, registry.sniff_order) + # Set data_type only if guessed_ext is a binary datatype + datatype = registry.get_datatype_by_extension(guessed_ext) + if isinstance(datatype, Binary): + data_type = guessed_ext + ext = guessed_ext + + return data_type, ext diff --git a/lib/galaxy/dependencies/pinned-requirements.txt b/lib/galaxy/dependencies/pinned-requirements.txt index adddb8247b9f..f5cb58868e9f 100644 --- a/lib/galaxy/dependencies/pinned-requirements.txt +++ b/lib/galaxy/dependencies/pinned-requirements.txt @@ -18,6 +18,7 @@ pysam>=0.13 #python_lzo==1.8 # pure Python packages +bdbag==1.1.1 bz2file==0.98; python_version < '3.3' ipaddress==1.0.18; python_version < '3.3' boltons==17.1.0 diff --git a/lib/galaxy/jobs/__init__.py b/lib/galaxy/jobs/__init__.py index de1c32538bcb..12e638063ac6 100644 --- a/lib/galaxy/jobs/__init__.py +++ b/lib/galaxy/jobs/__init__.py @@ -1371,7 +1371,7 @@ def path_rewriter(path): collected_datasets = { 'primary': self.tool.collect_primary_datasets(out_data, self.get_tool_provided_job_metadata(), tool_working_directory, input_ext, input_dbkey) } - self.tool.collect_dynamic_collections( + self.tool.collect_dynamic_outputs( out_collections, self.get_tool_provided_job_metadata(), job_working_directory=tool_working_directory, diff --git a/lib/galaxy/managers/collections.py b/lib/galaxy/managers/collections.py index e823b0f44ca3..cff0d0cdc241 100644 --- a/lib/galaxy/managers/collections.py +++ b/lib/galaxy/managers/collections.py @@ -46,17 +46,22 @@ def __init__(self, app): self.tag_manager = tags.GalaxyTagManager(app.model.context) self.ldda_manager = lddas.LDDAManager(app) - def precreate_dataset_collection_instance(self, trans, parent, name, implicit_inputs, implicit_output_name, structure): + def precreate_dataset_collection_instance(self, trans, parent, name, structure, implicit_inputs=None, implicit_output_name=None): # TODO: prebuild all required HIDs and send them in so no need to flush in between. - dataset_collection = self.precreate_dataset_collection(structure) + dataset_collection = self.precreate_dataset_collection(structure, allow_unitialized_element=implicit_output_name is not None) instance = self._create_instance_for_collection( trans, parent, name, dataset_collection, implicit_inputs=implicit_inputs, implicit_output_name=implicit_output_name, flush=False ) return instance - def precreate_dataset_collection(self, structure): - if structure.is_leaf or not structure.children_known: - return model.DatasetCollectionElement.UNINITIALIZED_ELEMENT + def precreate_dataset_collection(self, structure, allow_unitialized_element=True): + has_structure = not structure.is_leaf and structure.children_known + if not has_structure and allow_unitialized_element: + dataset_collection = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT + elif not has_structure: + collection_type_description = structure.collection_type_description + dataset_collection = model.DatasetCollection(populated=False) + dataset_collection.collection_type = collection_type_description.collection_type else: collection_type_description = structure.collection_type_description dataset_collection = model.DatasetCollection(populated=False) @@ -67,7 +72,7 @@ def precreate_dataset_collection(self, structure): if substructure.is_leaf: element = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT else: - element = self.precreate_dataset_collection(substructure) + element = self.precreate_dataset_collection(substructure, allow_unitialized_element=allow_unitialized_element) element = model.DatasetCollectionElement( element=element, @@ -78,7 +83,7 @@ def precreate_dataset_collection(self, structure): dataset_collection.elements = elements dataset_collection.element_count = len(elements) - return dataset_collection + return dataset_collection def create(self, trans, parent, name, collection_type, element_identifiers=None, elements=None, implicit_collection_info=None, trusted_identifiers=None, diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index 984dfb883b16..49121faf8463 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -2032,6 +2032,12 @@ def set_file_name(self, filename): return self.dataset.set_file_name(filename) file_name = property(get_file_name, set_file_name) + def link_to(self, path): + self.file_name = os.path.abspath(path) + # Since we are not copying the file into Galaxy's managed + # default file location, the dataset should never be purgable. + self.dataset.purgable = False + @property def extra_files_path(self): return self.dataset.extra_files_path diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index d0d1fcc1d819..a6e478498fcd 100755 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -102,6 +102,7 @@ # Tools that require Galaxy's Python environment to be preserved. GALAXY_LIB_TOOLS_UNVERSIONED = [ "upload1", + "__DATA_FETCH__", # Legacy tools bundled with Galaxy. "vcf_to_maf_customtrack1", "laj_1", @@ -1041,7 +1042,10 @@ def parse_input_elem(self, page_source, enctypes, context=None): group.file_type_name = elem.get('file_type_name', group.file_type_name) group.default_file_type = elem.get('default_file_type', group.default_file_type) group.metadata_ref = elem.get('metadata_ref', group.metadata_ref) - rval[group.file_type_name].refresh_on_change = True + try: + rval[group.file_type_name].refresh_on_change = True + except KeyError: + pass group_page_source = XmlPageSource(elem) group.inputs = self.parse_input_elem(group_page_source, enctypes, context) rval[group.name] = group @@ -1592,10 +1596,10 @@ def collect_primary_datasets(self, output, tool_provided_metadata, job_working_d """ return output_collect.collect_primary_datasets(self, output, tool_provided_metadata, job_working_directory, input_ext, input_dbkey=input_dbkey) - def collect_dynamic_collections(self, output, tool_provided_metadata, **kwds): - """ Find files corresponding to dynamically structured collections. + def collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds): + """Collect dynamic outputs associated with a job from this tool. """ - return output_collect.collect_dynamic_collections(self, output, tool_provided_metadata, **kwds) + return output_collect.collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds) def to_archive(self): tool = self diff --git a/lib/galaxy/tools/actions/upload.py b/lib/galaxy/tools/actions/upload.py index 70bf8152c44a..38f4f210ddeb 100644 --- a/lib/galaxy/tools/actions/upload.py +++ b/lib/galaxy/tools/actions/upload.py @@ -1,5 +1,7 @@ +import json import logging +from galaxy.exceptions import RequestParameterMissingException from galaxy.tools.actions import upload_common from galaxy.util import ExecutionTimer from . import ToolAction @@ -36,3 +38,59 @@ def execute(self, tool, trans, incoming={}, set_output_hid=True, history=None, * rval = upload_common.create_job(trans, incoming, tool, json_file_path, data_list, history=history) log.debug("Created upload job %s" % create_job_timer) return rval + + +class FetchUploadToolAction(ToolAction): + + def execute(self, tool, trans, incoming={}, set_output_hid=True, history=None, **kwargs): + dataset_upload_inputs = [] + for input_name, input in tool.inputs.items(): + if input.type == "upload_dataset": + dataset_upload_inputs.append(input) + assert dataset_upload_inputs, Exception("No dataset upload groups were found.") + + persisting_uploads_timer = ExecutionTimer() + # precreated_datasets = upload_common.get_precreated_datasets(trans, incoming, trans.app.model.HistoryDatasetAssociation) + incoming = upload_common.persist_uploads(incoming, trans) + log.debug("Persisted uploads %s" % persisting_uploads_timer) + + # Now replace references in requests with these. + files = incoming.get("files", []) + files_iter = iter(files) + request = json.loads(incoming.get("request_json")) + + def replace_file_srcs(request_part): + if isinstance(request_part, dict): + if request_part.get("src", None) == "files": + path_def = next(files_iter) + if path_def is None or path_def["file_data"] is None: + raise RequestParameterMissingException("Failed to find uploaded file matching target with src='files'") + request_part["path"] = path_def["file_data"]["local_filename"] + if "name" not in request_part: + request_part["name"] = path_def["file_data"]["filename"] + request_part["src"] = "path" + else: + for key, value in request_part.items(): + replace_file_srcs(value) + elif isinstance(request_part, list): + for value in request_part: + replace_file_srcs(value) + + replace_file_srcs(request) + + incoming["request_json"] = json.dumps(request) + log.info("incoming are %s" % incoming) + # We can pass an empty string as the cntrller here since it is used to check whether we + # are in an admin view, and this tool is currently not used there. + check_and_cleanup_timer = ExecutionTimer() + # uploaded_datasets = upload_common.get_uploaded_datasets(trans, '', incoming, precreated_datasets, dataset_upload_inputs, history=history) + # upload_common.cleanup_unused_precreated_datasets(precreated_datasets) + + # if not uploaded_datasets: + # return None, 'No data was entered in the upload form, please go back and choose data to upload.' + + log.debug("Checked and cleaned uploads %s" % check_and_cleanup_timer) + create_job_timer = ExecutionTimer() + rval = upload_common.create_job(trans, incoming, tool, None, [], history=history) + log.debug("Created upload job %s" % create_job_timer) + return rval diff --git a/lib/galaxy/tools/actions/upload_common.py b/lib/galaxy/tools/actions/upload_common.py index ae70cae17de3..5857aec9503f 100644 --- a/lib/galaxy/tools/actions/upload_common.py +++ b/lib/galaxy/tools/actions/upload_common.py @@ -16,7 +16,7 @@ from urllib.parse import urlparse from galaxy import datatypes, util -from galaxy.exceptions import ObjectInvalid +from galaxy.exceptions import ConfigDoesNotAllowException, ObjectInvalid from galaxy.managers import tags from galaxy.util import unicodify from galaxy.util.odict import odict @@ -102,7 +102,7 @@ def validate_url(url, ip_whitelist): pass else: # Otherwise, we deny access. - raise Exception("Access to this address in not permitted by server configuration") + raise ConfigDoesNotAllowException("Access to this address in not permitted by server configuration") return url @@ -123,7 +123,7 @@ def persist_uploads(params, trans): local_filename=local_filename) elif type(f) == dict and 'local_filename' not in f: raise Exception('Uploaded file was encoded in a way not understood by Galaxy.') - if upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '': + if 'url_paste' in upload_dataset and upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '': upload_dataset['url_paste'] = datatypes.sniff.stream_to_file( StringIO(validate_url(upload_dataset['url_paste'], trans.app.config.fetch_url_whitelist_ips)), prefix="strio_url_paste_" @@ -334,7 +334,11 @@ def new_upload(trans, cntrller, uploaded_dataset, library_bunch=None, history=No def get_uploaded_datasets(trans, cntrller, params, precreated_datasets, dataset_upload_inputs, library_bunch=None, history=None): uploaded_datasets = [] for dataset_upload_input in dataset_upload_inputs: - uploaded_datasets.extend(dataset_upload_input.get_uploaded_datasets(trans, params)) + try: + uploaded_datasets.extend(dataset_upload_input.get_uploaded_datasets(trans, params)) + except AttributeError: + # TODO: refine... + pass for uploaded_dataset in uploaded_datasets: data = get_precreated_dataset(precreated_datasets, uploaded_dataset.name) if not data: diff --git a/lib/galaxy/tools/data_fetch.py b/lib/galaxy/tools/data_fetch.py new file mode 100644 index 000000000000..0cb87f52485d --- /dev/null +++ b/lib/galaxy/tools/data_fetch.py @@ -0,0 +1,313 @@ +import argparse +import errno +import json +import os +import shutil +import sys +import tempfile + +import bdbag.bdbag_api + +from galaxy.datatypes import sniff +from galaxy.datatypes.registry import Registry +from galaxy.datatypes.upload_util import ( + handle_sniffable_binary_check, + handle_unsniffable_binary_check, + UploadProblemException, +) +from galaxy.util import in_directory +from galaxy.util.checkers import ( + check_binary, + check_html, +) +from galaxy.util.compression_utils import CompressedFile + +DESCRIPTION = """Data Import Script""" + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + args = _arg_parser().parse_args(argv) + + registry = Registry() + registry.load_datatypes(root_dir=args.galaxy_root, config=args.datatypes_registry) + + request_path = args.request + assert os.path.exists(request_path) + with open(request_path) as f: + request = json.load(f) + + upload_config = UploadConfig(request, registry) + galaxy_json = _request_to_galaxy_json(upload_config, request) + with open("galaxy.json", "w") as f: + json.dump(galaxy_json, f) + + +def _request_to_galaxy_json(upload_config, request): + targets = request.get("targets", []) + fetched_targets = [] + + for target in targets: + fetched_target = _fetch_target(upload_config, target) + fetched_targets.append(fetched_target) + + return {"__unnamed_outputs": fetched_targets} + + +def _fetch_target(upload_config, target): + destination = target.get("destination", None) + assert destination, "No destination defined." + + def expand_elements_from(target_or_item): + elements_from = target_or_item.get("elements_from", None) + items = None + assert not elements_from or elements_from in ["archive", "bagit", "bagit_archive", "directory"], elements_from + if elements_from == "archive": + decompressed_directory = _decompress_target(target_or_item) + items = _directory_to_items(decompressed_directory) + elif elements_from == "bagit": + _, elements_from_path = _has_src_to_path(target_or_item) + items = _bagit_to_items(elements_from_path) + elif elements_from == "bagit_archive": + decompressed_directory = _decompress_target(target_or_item) + items = _bagit_to_items(decompressed_directory) + elif elements_from == "directory": + _, elements_from_path = _has_src_to_path(target_or_item) + items = _directory_to_items(elements_from_path) + + if items: + del target_or_item["elements_from"] + target_or_item["elements"] = items + + _for_each_src(expand_elements_from, target) + items = target.get("elements", None) + assert items is not None, "No element definition found for destination [%s]" % destination + + fetched_target = {} + fetched_target["destination"] = destination + if "collection_type" in target: + fetched_target["collection_type"] = target["collection_type"] + if "name" in target: + fetched_target["name"] = target["name"] + + def _resolve_src(item): + converted_path = None + + name, path = _has_src_to_path(item) + dbkey = item.get("dbkey", "?") + requested_ext = item.get("ext", "auto") + info = item.get("info", None) + link_data_only = upload_config.link_data_only + if "link_data_only" in item: + # Allow overriding this on a per file basis. + link_data_only = _link_data_only(item) + to_posix_lines = upload_config.get_option(item, "to_posix_lines") + space_to_tab = upload_config.get_option(item, "space_to_tab") + in_place = item.get("in_place", False) + purge_source = item.get("purge_source", True) + + # Follow upload.py logic but without the auto-decompress logic. + registry = upload_config.registry + check_content = upload_config.check_content + data_type, ext = None, requested_ext + + is_binary = check_binary(path) + if is_binary: + data_type, ext = handle_sniffable_binary_check(data_type, ext, path, registry) + if data_type is None: + if is_binary: + data_type, ext = handle_unsniffable_binary_check( + data_type, ext, path, name, is_binary, requested_ext, check_content, registry + ) + if not data_type and check_content and check_html(path): + raise UploadProblemException('The uploaded file contains inappropriate HTML content') + + if data_type != 'binary': + if not link_data_only: + if to_posix_lines: + if space_to_tab: + line_count, converted_path = sniff.convert_newlines_sep2tabs(path, in_place=in_place, tmp_dir=".") + else: + line_count, converted_path = sniff.convert_newlines(path, in_place=in_place, tmp_dir=".") + + if requested_ext == 'auto': + ext = sniff.guess_ext(path, registry.sniff_order) + else: + ext = requested_ext + + data_type = ext + + if ext == 'auto' and data_type == 'binary': + ext = 'data' + if ext == 'auto' and requested_ext: + ext = requested_ext + if ext == 'auto': + ext = 'data' + + datatype = registry.get_datatype_by_extension(ext) + if link_data_only: + # Never alter a file that will not be copied to Galaxy's local file store. + if datatype.dataset_content_needs_grooming(path): + err_msg = 'The uploaded files need grooming, so change your Copy data into Galaxy? selection to be ' + \ + 'Copy files into Galaxy instead of Link to files without copying into Galaxy so grooming can be performed.' + raise UploadProblemException(err_msg) + + # If this file is not in the workdir make sure it gets there. + if not link_data_only and converted_path: + path = upload_config.ensure_in_working_directory(converted_path, purge_source, in_place) + elif not link_data_only: + path = upload_config.ensure_in_working_directory(path, purge_source, in_place) + + if not link_data_only and datatype and datatype.dataset_content_needs_grooming(path): + # Groom the dataset content if necessary + datatype.groom_dataset_content(path) + + rval = {"name": name, "filename": path, "dbkey": dbkey, "ext": ext, "link_data_only": link_data_only} + if info is not None: + rval["info"] = info + return rval + + elements = elements_tree_map(_resolve_src, items) + + fetched_target["elements"] = elements + return fetched_target + + +def _bagit_to_items(directory): + bdbag.bdbag_api.resolve_fetch(directory) + bdbag.bdbag_api.validate_bag(directory) + items = _directory_to_items(os.path.join(directory, "data")) + return items + + +def _decompress_target(target): + elements_from_name, elements_from_path = _has_src_to_path(target) + temp_directory = tempfile.mkdtemp(prefix=elements_from_name, dir=".") + decompressed_directory = CompressedFile(elements_from_path).extract(temp_directory) + return decompressed_directory + + +def elements_tree_map(f, items): + new_items = [] + for item in items: + if "elements" in item: + new_item = item.copy() + new_item["elements"] = elements_tree_map(f, item["elements"]) + new_items.append(new_item) + else: + new_items.append(f(item)) + return new_items + + +def _directory_to_items(directory): + items = [] + dir_elements = {} + for root, dirs, files in os.walk(directory): + if root in dir_elements: + target = dir_elements[root] + else: + target = items + for dir in dirs: + dir_dict = {"name": dir, "elements": []} + dir_elements[os.path.join(root, dir)] = dir_dict["elements"] + target.append(dir_dict) + for file in files: + target.append({"src": "path", "path": os.path.join(root, file)}) + + return items + + +def _has_src_to_path(item): + assert "src" in item, item + src = item.get("src") + name = item.get("name") + if src == "url": + url = item.get("url") + path = sniff.stream_url_to_file(url) + if name is None: + name = url.split("/")[-1] + else: + assert src == "path" + path = item["path"] + if name is None: + name = os.path.basename(path) + return name, path + + +def _arg_parser(): + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.add_argument("--galaxy-root") + parser.add_argument("--datatypes-registry") + parser.add_argument("--request-version") + parser.add_argument("--request") + return parser + + +class UploadConfig(object): + + def __init__(self, request, registry): + self.registry = registry + self.check_content = request.get("check_content" , True) + self.to_posix_lines = request.get("to_posix_lines", False) + self.space_to_tab = request.get("space_to_tab", False) + self.link_data_only = _link_data_only(request) + + self.__workdir = os.path.abspath(".") + self.__upload_count = 0 + + def get_option(self, item, key): + """Return item[key] if specified otherwise use default from UploadConfig. + + This default represents the default for the whole request instead item which + is the option for individual files. + """ + if key in item: + return item[key] + else: + return getattr(self, key) + + def __new_dataset_path(self): + path = "gxupload_%d" % self.__upload_count + self.__upload_count += 1 + return path + + def ensure_in_working_directory(self, path, purge_source, in_place): + if in_directory(path, self.__workdir): + return path + + new_path = self.__new_dataset_path() + if purge_source: + try: + shutil.move(path, new_path) + except OSError as e: + # We may not have permission to remove converted_path + if e.errno != errno.EACCES: + raise + else: + shutil.copy(path, new_path) + + return new_path + + +def _link_data_only(has_config_dict): + link_data_only = has_config_dict.get("link_data_only", False) + if not isinstance(link_data_only, bool): + # Allow the older string values of 'copy_files' and 'link_to_files' + link_data_only = link_data_only == "copy_files" + return link_data_only + + +def _for_each_src(f, obj): + if isinstance(obj, list): + for item in obj: + _for_each_src(f, item) + if isinstance(obj, dict): + if "src" in obj: + f(obj) + for key, value in obj.items(): + _for_each_src(f, value) + + +if __name__ == "__main__": + main() diff --git a/lib/galaxy/tools/data_fetch.xml b/lib/galaxy/tools/data_fetch.xml new file mode 100644 index 000000000000..5160cb2989c8 --- /dev/null +++ b/lib/galaxy/tools/data_fetch.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + $request_json + + + + + diff --git a/lib/galaxy/tools/execute.py b/lib/galaxy/tools/execute.py index 880bc4a38c30..a1dc7e950f4d 100644 --- a/lib/galaxy/tools/execute.py +++ b/lib/galaxy/tools/execute.py @@ -277,9 +277,9 @@ def precreate_output_collections(self, history, params): trans=trans, parent=history, name=output_collection_name, + structure=effective_structure, implicit_inputs=implicit_inputs, implicit_output_name=output_name, - structure=effective_structure, ) collection_instance.implicit_collection_jobs = implicit_collection_jobs collection_instances[output_name] = collection_instance diff --git a/lib/galaxy/tools/parameters/output_collect.py b/lib/galaxy/tools/parameters/output_collect.py index a452d5805125..88b950970bbf 100644 --- a/lib/galaxy/tools/parameters/output_collect.py +++ b/lib/galaxy/tools/parameters/output_collect.py @@ -9,9 +9,11 @@ from collections import namedtuple from galaxy import util +from galaxy.dataset_collections.structure import UnitializedTree from galaxy.tools.parser.output_collection_def import ( DEFAULT_DATASET_COLLECTOR_DESCRIPTION, INPUT_DBKEY_TOKEN, + ToolProvidedMetadataDatasetCollection, ) from galaxy.util import ( ExecutionTimer, @@ -34,6 +36,9 @@ def get_new_dataset_meta_by_basename(self, output_name, basename): def has_failed_outputs(self): return False + def get_unnamed_outputs(self): + return [] + class LegacyToolProvidedMetadata(object): @@ -84,6 +89,9 @@ def has_failed_outputs(self): return found_failed + def get_unnamed_outputs(self): + return [] + class ToolProvidedMetadata(object): @@ -124,14 +132,21 @@ def _elements_to_datasets(self, elements, level=0): def has_failed_outputs(self): found_failed = False - for meta in self.tool_provided_job_metadata.values(): + for output_name, meta in self.tool_provided_job_metadata.items(): + if output_name == "__unnamed_outputs": + continue + if meta.get("failed", False): found_failed = True return found_failed + def get_unnamed_outputs(self): + log.debug("unnamed outputs [%s]" % self.tool_provided_job_metadata) + return self.tool_provided_job_metadata.get("__unnamed_outputs", []) -def collect_dynamic_collections( + +def collect_dynamic_outputs( tool, output_collections, tool_provided_metadata, @@ -140,6 +155,7 @@ def collect_dynamic_collections( job=None, input_dbkey="?", ): + app = tool.app collections_service = tool.app.dataset_collections_service job_context = JobContext( tool, @@ -149,6 +165,88 @@ def collect_dynamic_collections( inp_data, input_dbkey, ) + log.info(tool_provided_metadata) + for unnamed_output_dict in tool_provided_metadata.get_unnamed_outputs(): + assert "destination" in unnamed_output_dict + assert "elements" in unnamed_output_dict + destination = unnamed_output_dict["destination"] + elements = unnamed_output_dict["elements"] + + assert "type" in destination + destination_type = destination["type"] + trans = job_context.work_context + + if destination_type == "library_folder": + + library_folder_manager = app.library_folder_manager + library_folder = library_folder_manager.get(trans, app.security.decode_id(destination.get("library_folder_id"))) + + def add_elements_to_folder(elements, library_folder): + for element in elements: + if "elements" in element: + assert "name" in element + name = element["name"] + description = element.get("description") + nested_folder = library_folder_manager.create(trans, library_folder.id, name, description) + add_elements_to_folder(element["elements"], nested_folder) + else: + discovered_file = discovered_file_for_unnamed_output(element, job_working_directory) + fields_match = discovered_file.match + designation = fields_match.designation + visible = fields_match.visible + ext = fields_match.ext + dbkey = fields_match.dbkey + info = element.get("info", None) + link_data = discovered_file.match.link_data + + # Create new primary dataset + name = fields_match.name or designation + + job_context.create_dataset( + ext=ext, + designation=designation, + visible=visible, + dbkey=dbkey, + name=name, + filename=discovered_file.path, + info=info, + library_folder=library_folder, + link_data=link_data + ) + + add_elements_to_folder(elements, library_folder) + elif destination_type == "hdca": + history = job.history + assert "collection_type" in unnamed_output_dict + name = unnamed_output_dict.get("name", "unnamed collection") + collection_type = unnamed_output_dict["collection_type"] + collection_type_description = collections_service.collection_type_descriptions.for_collection_type(collection_type) + structure = UnitializedTree(collection_type_description) + hdca = collections_service.precreate_dataset_collection_instance( + trans, history, name, structure=structure + ) + filenames = odict.odict() + + def add_to_discovered_files(elements, parent_identifiers=[]): + for element in elements: + if "elements" in element: + add_to_discovered_files(element["elements"], parent_identifiers + [element["name"]]) + else: + discovered_file = discovered_file_for_unnamed_output(element, job_working_directory, parent_identifiers) + filenames[discovered_file.path] = discovered_file + + add_to_discovered_files(elements) + + collection = hdca.collection + collection_builder = collections_service.collection_builder_for( + collection + ) + job_context.populate_collection_elements( + collection, + collection_builder, + filenames, + ) + collection_builder.populate() for name, has_collection in output_collections.items(): if name not in tool.output_collections: @@ -165,13 +263,19 @@ def collect_dynamic_collections( collection = has_collection try: + collection_builder = collections_service.collection_builder_for( collection ) + dataset_collectors = map(dataset_collector, output_collection_def.dataset_collector_descriptions) + output_name = output_collection_def.name + filenames = job_context.find_files(output_name, collection, dataset_collectors) job_context.populate_collection_elements( collection, collection_builder, - output_collection_def, + filenames, + name=output_collection_def.name, + metadata_source_name=output_collection_def.metadata_source, ) collection_builder.populate() except Exception: @@ -190,6 +294,11 @@ def __init__(self, tool, tool_provided_metadata, job, job_working_directory, inp self.job_working_directory = job_working_directory self.tool_provided_metadata = tool_provided_metadata + @property + def work_context(self): + from galaxy.work.context import WorkRequestContext + return WorkRequestContext(self.app, user=self.job.user) + @property def permissions(self): inp_data = self.inp_data @@ -207,15 +316,14 @@ def find_files(self, output_name, collection, dataset_collectors): filenames[discovered_file.path] = discovered_file return filenames - def populate_collection_elements(self, collection, root_collection_builder, output_collection_def): + def populate_collection_elements(self, collection, root_collection_builder, filenames, name=None, metadata_source_name=None): # TODO: allow configurable sorting. # # # # - dataset_collectors = map(dataset_collector, output_collection_def.dataset_collector_descriptions) - output_name = output_collection_def.name - filenames = self.find_files(output_name, collection, dataset_collectors) + if name is None: + name = "unnamed output" element_datasets = [] for filename, discovered_file in filenames.items(): @@ -234,6 +342,8 @@ def populate_collection_elements(self, collection, root_collection_builder, outp # Create new primary dataset name = fields_match.name or designation + link_data = discovered_file.match.link_data + dataset = self.create_dataset( ext=ext, designation=designation, @@ -241,14 +351,15 @@ def populate_collection_elements(self, collection, root_collection_builder, outp dbkey=dbkey, name=name, filename=filename, - metadata_source_name=output_collection_def.metadata_source, + metadata_source_name=metadata_source_name, + link_data=link_data, ) log.debug( "(%s) Created dynamic collection dataset for path [%s] with element identifier [%s] for output [%s] %s", self.job.id, filename, designation, - output_collection_def.name, + name, create_dataset_timer, ) element_datasets.append((element_identifiers, dataset)) @@ -263,7 +374,7 @@ def populate_collection_elements(self, collection, root_collection_builder, outp log.debug( "(%s) Add dynamic collection datsets to history for output [%s] %s", self.job.id, - output_collection_def.name, + name, add_datasets_timer, ) @@ -293,12 +404,18 @@ def create_dataset( dbkey, name, filename, - metadata_source_name, + metadata_source_name=None, + info=None, + library_folder=None, + link_data=False, ): app = self.app sa_session = self.sa_session - primary_data = _new_hda(app, sa_session, ext, designation, visible, dbkey, self.permissions) + if not library_folder: + primary_data = _new_hda(app, sa_session, ext, designation, visible, dbkey, self.permissions) + else: + primary_data = _new_ldda(self.work_context, name, ext, visible, dbkey, library_folder) # Copy metadata from one of the inputs if requested. metadata_source = None @@ -307,7 +424,11 @@ def create_dataset( sa_session.flush() # Move data from temp location to dataset location - app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) + if not link_data: + app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) + else: + primary_data.link_to(filename) + primary_data.set_size() # If match specified a name use otherwise generate one from # designation. @@ -318,6 +439,9 @@ def create_dataset( else: primary_data.init_meta() + if info is not None: + primary_data.info = info + primary_data.set_meta() primary_data.set_peek() @@ -484,6 +608,20 @@ def discover_files(output_name, tool_provided_metadata, extra_file_collectors, j yield DiscoveredFile(match.path, collector, match) +def discovered_file_for_unnamed_output(dataset, job_working_directory, parent_identifiers=[]): + extra_file_collector = DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR + target_directory = discover_target_directory(extra_file_collector, job_working_directory) + filename = dataset["filename"] + # handle link_data_only here, verify filename is in directory if not linking... + if not dataset.get("link_data_only"): + path = os.path.join(target_directory, filename) + if not util.in_directory(target_directory, path): + raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.") + else: + path = filename + return DiscoveredFile(path, extra_file_collector, JsonCollectedDatasetMatch(dataset, extra_file_collector, filename, path=path, parent_identifiers=parent_identifiers)) + + def discover_target_directory(extra_file_collector, job_working_directory): directory = job_working_directory if extra_file_collector.directory: @@ -585,11 +723,12 @@ def _compose(f, g): class JsonCollectedDatasetMatch(object): - def __init__(self, as_dict, collector, filename, path=None): + def __init__(self, as_dict, collector, filename, path=None, parent_identifiers=[]): self.as_dict = as_dict self.collector = collector self.filename = filename self.path = path + self._parent_identifiers = parent_identifiers @property def designation(self): @@ -607,7 +746,7 @@ def designation(self): @property def element_identifiers(self): - return self.raw_element_identifiers or [self.designation] + return self._parent_identifiers + (self.raw_element_identifiers or [self.designation]) @property def raw_element_identifiers(self): @@ -644,6 +783,10 @@ def visible(self): except KeyError: return self.collector.default_visible + @property + def link_data(self): + return bool(self.as_dict.get("link_data_only", False)) + class RegexCollectedDatasetMatch(JsonCollectedDatasetMatch): @@ -656,6 +799,42 @@ def __init__(self, re_match, collector, filename, path=None): UNSET = object() +def _new_ldda( + trans, + name, + ext, + visible, + dbkey, + library_folder, +): + ld = trans.app.model.LibraryDataset(folder=library_folder, name=name) + trans.sa_session.add(ld) + trans.sa_session.flush() + trans.app.security_agent.copy_library_permissions(trans, library_folder, ld) + + ldda = trans.app.model.LibraryDatasetDatasetAssociation(name=name, + extension=ext, + dbkey=dbkey, + library_dataset=ld, + user=trans.user, + create_dataset=True, + sa_session=trans.sa_session) + trans.sa_session.add(ldda) + ldda.state = ldda.states.OK + # Permissions must be the same on the LibraryDatasetDatasetAssociation and the associated LibraryDataset + trans.app.security_agent.copy_library_permissions(trans, ld, ldda) + # Copy the current user's DefaultUserPermissions to the new LibraryDatasetDatasetAssociation.dataset + trans.app.security_agent.set_all_dataset_permissions(ldda.dataset, trans.app.security_agent.user_get_default_permissions(trans.user)) + library_folder.add_library_dataset(ld, genome_build=dbkey) + trans.sa_session.add(library_folder) + trans.sa_session.flush() + + ld.library_dataset_dataset_association_id = ldda.id + trans.sa_session.add(ld) + trans.sa_session.flush() + return ldda + + def _new_hda( app, sa_session, @@ -682,3 +861,4 @@ def _new_hda( DEFAULT_DATASET_COLLECTOR = DatasetCollector(DEFAULT_DATASET_COLLECTOR_DESCRIPTION) +DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR = ToolMetadataDatasetCollector(ToolProvidedMetadataDatasetCollection()) diff --git a/lib/galaxy/tools/special_tools.py b/lib/galaxy/tools/special_tools.py index 953e69dee647..129b7064a941 100644 --- a/lib/galaxy/tools/special_tools.py +++ b/lib/galaxy/tools/special_tools.py @@ -4,6 +4,7 @@ SPECIAL_TOOLS = { "history export": "galaxy/tools/imp_exp/exp_history_to_archive.xml", "history import": "galaxy/tools/imp_exp/imp_history_from_archive.xml", + "data fetch": "galaxy/tools/data_fetch.xml", } diff --git a/lib/galaxy/webapps/galaxy/api/_fetch_util.py b/lib/galaxy/webapps/galaxy/api/_fetch_util.py new file mode 100644 index 000000000000..6630dfd79a93 --- /dev/null +++ b/lib/galaxy/webapps/galaxy/api/_fetch_util.py @@ -0,0 +1,205 @@ +import logging +import os + +from galaxy.actions.library import ( + validate_path_upload, + validate_server_directory_upload, +) +from galaxy.exceptions import ( + RequestParameterInvalidException +) +from galaxy.tools.actions.upload_common import validate_url +from galaxy.util import ( + relpath, +) + +log = logging.getLogger(__name__) + +VALID_DESTINATION_TYPES = ["library", "library_folder", "hdca"] +ELEMENTS_FROM_TYPE = ["archive", "bagit", "bagit_archive", "directory"] +# These elements_from cannot be sym linked to because they only exist during upload. +ELEMENTS_FROM_TRANSIENT_TYPES = ["archive", "bagit_archive"] + + +def validate_and_normalize_targets(trans, payload): + """Validate and normalize all src references in fetch targets. + + - Normalize ftp_import and server_dir src entries into simple path entires + with the relevant paths resolved and permissions / configuration checked. + - Check for file:// URLs in items src of "url" and convert them into path + src items - after verifying path pastes are allowed and user is admin. + - Check for valid URLs to be fetched for http and https entries. + - Based on Galaxy configuration and upload types set purge_source and in_place + as needed for each upload. + """ + targets = payload.get("targets", []) + + for target in targets: + destination = _get_required_item(target, "destination", "Each target must specify a 'destination'") + destination_type = _get_required_item(destination, "type", "Each target destination must specify a 'type'") + if destination_type not in VALID_DESTINATION_TYPES: + template = "Invalid target destination type [%s] encountered, must be one of %s" + msg = template % (destination_type, VALID_DESTINATION_TYPES) + raise RequestParameterInvalidException(msg) + if destination_type == "library": + library_name = _get_required_item(destination, "name", "Must specify a library name") + description = destination.get("description", "") + synopsis = destination.get("synopsis", "") + library = trans.app.library_manager.create( + trans, library_name, description=description, synopsis=synopsis + ) + destination["type"] = "library_folder" + for key in ["name", "description", "synopsis"]: + if key in destination: + del destination[key] + destination["library_folder_id"] = trans.app.security.encode_id(library.root_folder.id) + + # Unlike upload.py we don't transmit or use run_as_real_user in the job - we just make sure + # in_place and purge_source are set on the individual upload fetch sources as needed based + # on this. + run_as_real_user = trans.app.config.external_chown_script is None # See comment in upload.py + purge_ftp_source = getattr(trans.app.config, 'ftp_upload_purge', True) and not run_as_real_user + + payload["check_content"] = trans.app.config.check_upload_content + + def check_src(item): + # Normalize file:// URLs into paths. + if item["src"] == "url" and item["url"].startswith("file://"): + item["src"] = "path" + item["path"] = item["url"][len("file://"):] + del item["path"] + + if "in_place" in item: + raise RequestParameterInvalidException("in_place cannot be set in the upload request") + + src = item["src"] + + # Check link_data_only can only be set for certain src types and certain elements_from types. + _handle_invalid_link_data_only_elements_type(item) + if src not in ["path", "server_dir"]: + _handle_invalid_link_data_only_type(item) + elements_from = item.get("elements_from", None) + if elements_from and elements_from not in ELEMENTS_FROM_TYPE: + raise RequestParameterInvalidException("Invalid elements_from/items_from found in request") + + if src == "path" or (src == "url" and item["url"].startswith("file:")): + # Validate is admin, leave alone. + validate_path_upload(trans) + elif src == "server_dir": + # Validate and replace with path definition. + server_dir = item["server_dir"] + full_path, _ = validate_server_directory_upload(trans, server_dir) + item["src"] = "path" + item["path"] = full_path + elif src == "ftp_import": + ftp_path = item["ftp_path"] + full_path = None + + # It'd be nice if this can be de-duplicated with what is in parameters/grouping.py. + user_ftp_dir = trans.user_ftp_dir + is_directory = False + + assert not os.path.islink(user_ftp_dir), "User FTP directory cannot be a symbolic link" + for (dirpath, dirnames, filenames) in os.walk(user_ftp_dir): + for filename in filenames: + if ftp_path == filename: + path = relpath(os.path.join(dirpath, filename), user_ftp_dir) + if not os.path.islink(os.path.join(dirpath, filename)): + full_path = os.path.abspath(os.path.join(user_ftp_dir, path)) + break + + for dirname in dirnames: + if ftp_path == dirname: + path = relpath(os.path.join(dirpath, dirname), user_ftp_dir) + if not os.path.islink(os.path.join(dirpath, dirname)): + full_path = os.path.abspath(os.path.join(user_ftp_dir, path)) + is_directory = True + break + + if is_directory: + # If the target is a directory - make sure no files under it are symbolic links + for (dirpath, dirnames, filenames) in os.walk(full_path): + for filename in filenames: + if ftp_path == filename: + path = relpath(os.path.join(dirpath, filename), full_path) + if not os.path.islink(os.path.join(dirpath, filename)): + full_path = False + break + + for dirname in dirnames: + if ftp_path == dirname: + path = relpath(os.path.join(dirpath, filename), full_path) + if not os.path.islink(os.path.join(dirpath, filename)): + full_path = False + break + + if not full_path: + raise RequestParameterInvalidException("Failed to find referenced ftp_path or symbolic link was enountered") + + item["src"] = "path" + item["path"] = full_path + item["purge_source"] = purge_ftp_source + elif src == "url": + url = item["url"] + looks_like_url = False + for url_prefix in ["http://", "https://", "ftp://", "ftps://"]: + if url.startswith(url_prefix): + looks_like_url = True + break + + if not looks_like_url: + raise RequestParameterInvalidException("Invalid URL [%s] found in src definition." % url) + + validate_url(url, trans.app.config.fetch_url_whitelist_ips) + item["in_place"] = run_as_real_user + elif src == "files": + item["in_place"] = run_as_real_user + + _replace_request_syntax_sugar(targets) + _for_each_src(check_src, targets) + + +def _replace_request_syntax_sugar(obj): + # For data libraries and hdas to make sense - allow items and items_from in place of elements + # and elements_from. This is destructive and modifies the supplied request. + if isinstance(obj, list): + for el in obj: + _replace_request_syntax_sugar(el) + elif isinstance(obj, dict): + if "items" in obj: + obj["elements"] = obj["items"] + del obj["items"] + if "items_from" in obj: + obj["elements_from"] = obj["items_from"] + del obj["items_from"] + for value in obj.values(): + _replace_request_syntax_sugar(value) + + +def _handle_invalid_link_data_only_type(item): + link_data_only = item.get("link_data_only", False) + if link_data_only: + raise RequestParameterInvalidException("link_data_only is invalid for src type [%s]" % item.get("src")) + + +def _handle_invalid_link_data_only_elements_type(item): + link_data_only = item.get("link_data_only", False) + if link_data_only and item.get("elements_from", False) in ELEMENTS_FROM_TRANSIENT_TYPES: + raise RequestParameterInvalidException("link_data_only is invalid for derived elements from [%s]" % item.get("elements_from")) + + +def _get_required_item(from_dict, key, message): + if key not in from_dict: + raise RequestParameterInvalidException(message) + return from_dict[key] + + +def _for_each_src(f, obj): + if isinstance(obj, list): + for item in obj: + _for_each_src(f, item) + if isinstance(obj, dict): + if "src" in obj: + f(obj) + for key, value in obj.items(): + _for_each_src(f, value) diff --git a/lib/galaxy/webapps/galaxy/api/tools.py b/lib/galaxy/webapps/galaxy/api/tools.py index 412cc4082d89..107a090b03e8 100644 --- a/lib/galaxy/webapps/galaxy/api/tools.py +++ b/lib/galaxy/webapps/galaxy/api/tools.py @@ -12,9 +12,14 @@ from galaxy.web import _future_expose_api_anonymous_and_sessionless as expose_api_anonymous_and_sessionless from galaxy.web.base.controller import BaseAPIController from galaxy.web.base.controller import UsesVisualizationMixin +from ._fetch_util import validate_and_normalize_targets log = logging.getLogger(__name__) +# Do not allow these tools to be called directly - they (it) enforces extra security and +# provides access via a different API endpoint. +PROTECTED_TOOLS = ["__DATA_FETCH__"] + class ToolsController(BaseAPIController, UsesVisualizationMixin): """ @@ -290,12 +295,52 @@ def download(self, trans, id, **kwds): trans.response.headers["Content-Disposition"] = 'attachment; filename="%s.tgz"' % (id) return download_file + @expose_api_anonymous + def fetch(self, trans, payload, **kwd): + """Adapt clean API to tool-constrained API. + """ + log.info("Keywords are %s" % payload) + request_version = '1' + history_id = payload.pop("history_id") + clean_payload = {} + files_payload = {} + for key, value in payload.items(): + if key == "key": + continue + if key.startswith('files_') or key.startswith('__files_'): + files_payload[key] = value + continue + clean_payload[key] = value + log.info("payload %s" % clean_payload) + validate_and_normalize_targets(trans, clean_payload) + clean_payload["check_content"] = trans.app.config.check_upload_content + request = dumps(clean_payload) + log.info(request) + create_payload = { + 'tool_id': "__DATA_FETCH__", + 'history_id': history_id, + 'inputs': { + 'request_version': request_version, + 'request_json': request, + }, + } + create_payload.update(files_payload) + return self._create(trans, create_payload, **kwd) + @expose_api_anonymous def create(self, trans, payload, **kwd): """ POST /api/tools Executes tool using specified inputs and returns tool's outputs. """ + tool_id = payload.get("tool_id") + if tool_id in PROTECTED_TOOLS: + raise exceptions.RequestParameterInvalidException("Cannot execute tool [%s] directly, must use alternative endpoint." % tool_id) + if tool_id is None: + raise exceptions.RequestParameterInvalidException("Must specify a valid tool_id to use this endpoint.") + return self._create(trans, payload, **kwd) + + def _create(self, trans, payload, **kwd): # HACK: for now, if action is rerun, rerun tool. action = payload.get('action', None) if action == 'rerun': diff --git a/lib/galaxy/webapps/galaxy/buildapp.py b/lib/galaxy/webapps/galaxy/buildapp.py index fa14db1e8063..7a9a60bf91bd 100644 --- a/lib/galaxy/webapps/galaxy/buildapp.py +++ b/lib/galaxy/webapps/galaxy/buildapp.py @@ -268,6 +268,7 @@ def populate_api_routes(webapp, app): # ====== TOOLS API ====== # ======================= + webapp.mapper.connect('/api/tools/fetch', action='fetch', controller='tools', conditions=dict(method=["POST"])) webapp.mapper.connect('/api/tools/all_requirements', action='all_requirements', controller="tools") webapp.mapper.connect('/api/tools/{id:.+?}/build', action='build', controller="tools") webapp.mapper.connect('/api/tools/{id:.+?}/reload', action='reload', controller="tools") diff --git a/scripts/api/fetch_to_library.py b/scripts/api/fetch_to_library.py new file mode 100644 index 000000000000..6c497bcb402b --- /dev/null +++ b/scripts/api/fetch_to_library.py @@ -0,0 +1,33 @@ +import argparse +import json + +import requests +import yaml + + +def main(): + parser = argparse.ArgumentParser(description='Upload a directory into a data library') + parser.add_argument("-u", "--url", dest="url", required=True, help="Galaxy URL") + parser.add_argument("-a", "--api", dest="api_key", required=True, help="API Key") + parser.add_argument('target', metavar='FILE', type=str, + help='file describing data library to fetch') + args = parser.parse_args() + with open(args.target, "r") as f: + target = yaml.load(f) + + histories_url = args.url + "/api/histories" + new_history_response = requests.post(histories_url, data={'key': args.api_key}) + + fetch_url = args.url + '/api/tools/fetch' + payload = { + 'key': args.api_key, + 'targets': json.dumps([target]), + 'history_id': new_history_response.json()["id"] + } + + response = requests.post(fetch_url, data=payload) + print(response.content) + + +if __name__ == '__main__': + main() diff --git a/scripts/api/fetch_to_library_example.yml b/scripts/api/fetch_to_library_example.yml new file mode 100644 index 000000000000..44bc35ef43b5 --- /dev/null +++ b/scripts/api/fetch_to_library_example.yml @@ -0,0 +1,42 @@ +destination: + type: library + name: Training Material + description: Data for selected tutorials from https://training.galaxyproject.org. +items: + - name: Quality Control + description: | + Data for sequence quality control tutorial at http://galaxyproject.github.io/training-material/topics/sequence-analysis/tutorials/quality-control/tutorial.html. + + 10.5281/zenodo.61771 + items: + - src: url + url: https://zenodo.org/record/61771/files/GSM461178_untreat_paired_subset_1.fastq + name: GSM461178_untreat_paired_subset_1 + ext: fastqsanger + info: Untreated subseq of GSM461178 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/61771/files/GSM461182_untreat_single_subset.fastq + name: GSM461182_untreat_single_subset + ext: fastqsanger + info: Untreated subseq of GSM461182 from 10.1186/s12864-017-3692-8 + - name: Small RNA-Seq + description: | + Data for small RNA-seq tutorial available at http://galaxyproject.github.io/training-material/topics/transcriptomics/tutorials/srna/tutorial.html + + 10.5281/zenodo.826906 + items: + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep1_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep1 + ext: fastqsanger.gz + info: Downsample rep1 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep2_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep2 + ext: fastqsanger.gz + info: Downsample rep2 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep3_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep3 + ext: fastqsanger.gz + info: Downsample rep3 from 10.1186/s12864-017-3692-8 diff --git a/test-data/example-bag.zip b/test-data/example-bag.zip new file mode 100644 index 000000000000..ee4de52c265b Binary files /dev/null and b/test-data/example-bag.zip differ diff --git a/test-data/testdir1.zip b/test-data/testdir1.zip new file mode 100644 index 000000000000..0f71e2ce96aa Binary files /dev/null and b/test-data/testdir1.zip differ diff --git a/test/api/test_dataset_collections.py b/test/api/test_dataset_collections.py index ca8950d8bb0c..87752d2b9096 100644 --- a/test/api/test_dataset_collections.py +++ b/test/api/test_dataset_collections.py @@ -188,6 +188,78 @@ def test_enforces_unique_names(self): create_response = self._post("dataset_collections", payload) self._assert_status_code_is(create_response, 400) + def test_upload_collection(self): + elements = [{"src": "files", "dbkey": "hg19", "info": "my cool bed"}] + targets = [{ + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + "name": "Test upload", + }] + payload = { + "history_id": self.history_id, + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + hdca = self._assert_one_collection_created_in_history() + self.assertEquals(hdca["name"], "Test upload") + assert len(hdca["elements"]) == 1, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "4.bed" + assert element0["object"]["file_size"] == 61 + + def test_upload_nested(self): + elements = [{"name": "samp1", "elements": [{"src": "files", "dbkey": "hg19", "info": "my cool bed"}]}] + targets = [{ + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list:list", + "name": "Test upload", + }] + payload = { + "history_id": self.history_id, + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + hdca = self._assert_one_collection_created_in_history() + self.assertEquals(hdca["name"], "Test upload") + assert len(hdca["elements"]) == 1, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "samp1" + + def test_upload_collection_from_url(self): + elements = [{"src": "url", "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed", "info": "my cool bed"}] + targets = [{ + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + }] + payload = { + "history_id": self.history_id, + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + hdca = self._assert_one_collection_created_in_history() + assert len(hdca["elements"]) == 1, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "4.bed" + assert element0["object"]["file_size"] == 61 + + def _assert_one_collection_created_in_history(self): + contents_response = self._get("histories/%s/contents/dataset_collections" % self.history_id) + self._assert_status_code_is(contents_response, 200) + contents = contents_response.json() + assert len(contents) == 1 + hdca = contents[0] + assert hdca["history_content_type"] == "dataset_collection" + hdca_id = hdca["id"] + collection_response = self._get("histories/%s/contents/dataset_collections/%s" % (self.history_id, hdca_id)) + self._assert_status_code_is(collection_response, 200) + return collection_response.json() + def _check_create_response(self, create_response): self._assert_status_code_is(create_response, 200) dataset_collection = create_response.json() diff --git a/test/api/test_libraries.py b/test/api/test_libraries.py index 2a715f50fcbb..ae0303a8074f 100644 --- a/test/api/test_libraries.py +++ b/test/api/test_libraries.py @@ -1,3 +1,5 @@ +import json + from base import api from base.populators import ( DatasetCollectionPopulator, @@ -95,6 +97,94 @@ def test_create_dataset(self): assert library_dataset["peek"].find("create_test") >= 0 assert library_dataset["file_ext"] == "txt", library_dataset["file_ext"] + def test_fetch_upload_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("flat_zip") + items = [{"src": "files", "dbkey": "hg19", "info": "my cool bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + assert dataset["genome_build"] == "hg19", dataset + assert dataset["misc_info"] == "my cool bed", dataset + assert dataset["file_ext"] == "bed", dataset + + def test_fetch_zip_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("flat_zip") + bed_test_data_path = self.test_data_resolver.get_filename("4.bed.zip") + targets = [{ + "destination": destination, + "items_from": "archive", "src": "files", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(bed_test_data_path)} + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_single_url_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("single_url") + items = [{"src": "url", "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_url_archive_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("single_url") + targets = [{ + "destination": destination, + "items_from": "archive", + "src": "url", + "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed.zip", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_bagit_archive_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("bagit_archive") + example_bag_path = self.test_data_resolver.get_filename("example-bag.zip") + targets = [{ + "destination": destination, + "items_from": "bagit_archive", "src": "files", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(example_bag_path)}, + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/README.txt") + assert dataset["file_size"] == 66, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/bdbag-profile.json") + assert dataset["file_size"] == 723, dataset + + def _setup_fetch_to_folder(self, test_name): + return self.library_populator.setup_fetch_to_folder(test_name) + def test_create_dataset_in_folder(self): library = self.library_populator.new_private_library("ForCreateDatasets") folder_response = self._create_folder(library) diff --git a/test/base/integration_util.py b/test/base/integration_util.py index 4339436a9b4a..8910f9b82996 100644 --- a/test/base/integration_util.py +++ b/test/base/integration_util.py @@ -7,6 +7,7 @@ import os from unittest import skip, TestCase +from galaxy.tools.verify.test_data import TestDataResolver from .api import UsesApiTestCaseMixin from .driver_util import GalaxyTestDriver @@ -45,6 +46,7 @@ def tearDownClass(cls): cls._app_available = False def setUp(self): + self.test_data_resolver = TestDataResolver() # Setup attributes needed for API testing... server_wrapper = self._test_driver.server_wrappers[0] host = server_wrapper.host diff --git a/test/base/populators.py b/test/base/populators.py index c8388d06bcc2..9da1baa12628 100644 --- a/test/base/populators.py +++ b/test/base/populators.py @@ -148,14 +148,30 @@ def new_dataset_request(self, history_id, content=None, wait=False, **kwds): self.wait_for_tool_run(history_id, run_response, assert_ok=kwds.get('assert_ok', True)) return run_response + def fetch(self, payload, assert_ok=True, timeout=DEFAULT_TIMEOUT): + tool_response = self._post("tools/fetch", data=payload) + if assert_ok: + job = self.check_run(tool_response) + self.wait_for_job(job["id"], timeout=timeout) + + job = tool_response.json()["jobs"][0] + details = self.get_job_details(job["id"]).json() + assert details["state"] == "ok", details + + return tool_response + def wait_for_tool_run(self, history_id, run_response, timeout=DEFAULT_TIMEOUT, assert_ok=True): - run = run_response.json() - assert run_response.status_code == 200, run - job = run["jobs"][0] + job = self.check_run(run_response) self.wait_for_job(job["id"], timeout=timeout) self.wait_for_history(history_id, assert_ok=assert_ok, timeout=timeout) return run_response + def check_run(self, run_response): + run = run_response.json() + assert run_response.status_code == 200, run + job = run["jobs"][0] + return job + def wait_for_history(self, history_id, assert_ok=False, timeout=DEFAULT_TIMEOUT): try: return wait_on_state(lambda: self._get("histories/%s" % history_id), assert_ok=assert_ok, timeout=timeout) @@ -266,8 +282,8 @@ def run_tool(self, tool_id, inputs, history_id, assert_ok=True, **kwds): else: return tool_response - def tools_post(self, payload): - tool_response = self._post("tools", data=payload) + def tools_post(self, payload, url="tools"): + tool_response = self._post(url, data=payload) return tool_response def get_history_dataset_content(self, history_id, wait=True, filename=None, **kwds): @@ -463,6 +479,11 @@ class LibraryPopulator(object): def __init__(self, galaxy_interactor): self.galaxy_interactor = galaxy_interactor + self.dataset_populator = DatasetPopulator(galaxy_interactor) + + def get_libraries(self): + get_response = self.galaxy_interactor.get("libraries") + return get_response.json() def new_private_library(self, name): library = self.new_library(name) @@ -563,6 +584,24 @@ def show(): return library, library_dataset + def get_library_contents_with_path(self, library_id, path): + all_contents_response = self.galaxy_interactor.get("libraries/%s/contents" % library_id) + api_asserts.assert_status_code_is(all_contents_response, 200) + all_contents = all_contents_response.json() + matching = [c for c in all_contents if c["name"] == path] + if len(matching) == 0: + raise Exception("Failed to find library contents with path [%s], contents are %s" % (path, all_contents)) + get_response = self.galaxy_interactor.get(matching[0]["url"]) + api_asserts.assert_status_code_is(get_response, 200) + return get_response.json() + + def setup_fetch_to_folder(self, test_name): + history_id = self.dataset_populator.new_history() + library = self.new_private_library(test_name) + folder_id = library["root_folder_id"][1:] + destination = {"type": "library_folder", "library_folder_id": folder_id} + return history_id, library, destination + class BaseDatasetCollectionPopulator(object): diff --git a/test/integration/test_upload_configuration_options.py b/test/integration/test_upload_configuration_options.py index d5f6789723ca..cdd97bdf8f11 100644 --- a/test/integration/test_upload_configuration_options.py +++ b/test/integration/test_upload_configuration_options.py @@ -19,6 +19,7 @@ framework but tested here for FTP uploads. """ +import json import os import re import shutil @@ -53,6 +54,42 @@ def setUp(self): self.library_populator = LibraryPopulator(self.galaxy_interactor) self.history_id = self.dataset_populator.new_history() + def fetch_target(self, target, assert_ok=False, attach_test_file=False): + payload = { + "history_id": self.history_id, + "targets": json.dumps([target]), + } + if attach_test_file: + payload["__files"] = {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))} + + response = self.dataset_populator.fetch(payload, assert_ok=assert_ok) + return response + + +class InvalidFetchRequestsTestCase(BaseUploadContentConfigurationTestCase): + + def test_in_place_not_allowed(self): + elements = [{"src": "files", "in_place": False}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target, attach_test_file=True) + self._assert_status_code_is(response, 400) + assert 'in_place' in response.json()["err_msg"] + + def test_files_not_attached(self): + elements = [{"src": "files"}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 400) + assert 'Failed to find uploaded file matching target' in response.json()["err_msg"] + class NonAdminsCannotPasteFilePathTestCase(BaseUploadContentConfigurationTestCase): @@ -93,6 +130,26 @@ def test_disallowed_for_libraries(self): response = self.library_populator.raw_library_contents_create(library["id"], payload, files=files) assert response.status_code == 403, response.json() + def test_disallowed_for_fetch(self): + elements = [{"src": "path", "path": "%s/1.txt" % TEST_DATA_DIRECTORY}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 403) + + def test_disallowed_for_fetch_urls(self): + elements = [{"src": "url", "url": "file://%s/1.txt" % TEST_DATA_DIRECTORY}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 403) + class AdminsCanPasteFilePathsTestCase(BaseUploadContentConfigurationTestCase): @@ -117,6 +174,26 @@ def test_admin_path_paste_libraries(self): # Was 403 for non-admin above. assert response.status_code == 200 + def test_admin_fetch(self): + elements = [{"src": "path", "path": "%s/1.txt" % TEST_DATA_DIRECTORY}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + + def test_admin_fetch_file_url(self): + elements = [{"src": "url", "url": "file://%s/1.txt" % TEST_DATA_DIRECTORY}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + class DefaultBinaryContentFiltersTestCase(BaseUploadContentConfigurationTestCase): @@ -211,6 +288,16 @@ def test_blocked_url_for_composite_file(self): # the newer API decorator that handles those details. assert create_response.status_code >= 400 + def test_blocked_url_for_fetch(self): + elements = [{"src": "url", "url": "http://localhost"}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 403) + class BaseFtpUploadConfigurationTestCase(BaseUploadContentConfigurationTestCase): @@ -246,6 +333,9 @@ def _ensure_directory(self, path): if not os.path.exists(path): os.makedirs(path) + def _get_user_ftp_path(self): + return os.path.join(self.ftp_dir(), TEST_USER) + class SimpleFtpUploadConfigurationTestCase(BaseFtpUploadConfigurationTestCase): @@ -265,8 +355,24 @@ def test_ftp_upload(self): # ... but it isn't - is this a bug? Are only certain kinds of uploads purged? # assert not os.path.exists(ftp_path) - def _get_user_ftp_path(self): - return os.path.join(self.ftp_dir(), TEST_USER) + def test_ftp_fetch(self): + content = "hello world\n" + dir_path = self._get_user_ftp_path() + ftp_path = self._write_ftp_file(dir_path, content) + ftp_files = self.dataset_populator.get_remote_files() + assert len(ftp_files) == 1, ftp_files + assert ftp_files[0]["path"] == "test" + assert os.path.exists(ftp_path) + elements = [{"src": "ftp_import", "ftp_path": ftp_files[0]["path"]}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + dataset = self.dataset_populator.get_history_dataset_details(self.history_id, hid=2) + self._check_content(dataset, content) class ExplicitEmailAsIdentifierFtpUploadConfigurationTestCase(SimpleFtpUploadConfigurationTestCase): @@ -319,6 +425,50 @@ def test_ftp_uploads_not_purged(self): assert os.path.exists(ftp_path) +class AdvancedFtpUploadFetchTestCase(BaseFtpUploadConfigurationTestCase): + + def test_fetch_ftp_directory(self): + dir_path = self._get_user_ftp_path() + self._write_ftp_file(os.path.join(dir_path, "subdir"), "content 1", filename="1") + self._write_ftp_file(os.path.join(dir_path, "subdir"), "content 22", filename="2") + self._write_ftp_file(os.path.join(dir_path, "subdir"), "content 333", filename="3") + target = { + "destination": {"type": "hdca"}, + "elements_from": "directory", + "src": "ftp_import", + "ftp_path": "subdir", + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + hdca = self.dataset_populator.get_history_collection_details(self.history_id, hid=1) + assert len(hdca["elements"]) == 3, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "1" + assert element0["object"]["file_size"] == 9 + + def test_fetch_nested_elements_from(self): + dir_path = self._get_user_ftp_path() + self._write_ftp_file(os.path.join(dir_path, "subdir1"), "content 1", filename="1") + self._write_ftp_file(os.path.join(dir_path, "subdir1"), "content 22", filename="2") + self._write_ftp_file(os.path.join(dir_path, "subdir2"), "content 333", filename="3") + elements = [ + {"name": "subdirel1", "src": "ftp_import", "ftp_path": "subdir1", "elements_from": "directory", "collection_type": "list"}, + {"name": "subdirel2", "src": "ftp_import", "ftp_path": "subdir2", "elements_from": "directory", "collection_type": "list"}, + ] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list:list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + hdca = self.dataset_populator.get_history_collection_details(self.history_id, hid=1) + assert len(hdca["elements"]) == 2, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "subdirel1" + + class UploadOptionsFtpUploadConfigurationTestCase(BaseFtpUploadConfigurationTestCase): def test_upload_api_option_space_to_tab(self): @@ -447,6 +597,8 @@ def test_server_dir_uploads_403_if_dir_not_set(self): class ServerDirectoryValidUsageTestCase(BaseUploadContentConfigurationTestCase): + # This tests the library contents API - I think equivalent functionality is available via library datasets API + # and should also be tested. require_admin_user = True @@ -470,3 +622,84 @@ def test_library_import_dir_not_available_to_non_admins(self): payload, files = self.library_populator.create_dataset_request(library, upload_option="upload_directory", server_dir="library") response = self.library_populator.raw_library_contents_create(library["id"], payload, files=files) assert response.status_code == 403, response.json() + + +class FetchByPathTestCase(BaseUploadContentConfigurationTestCase): + + require_admin_user = True + + @classmethod + def handle_galaxy_config_kwds(cls, config): + config["allow_path_paste"] = True + + def test_fetch_path_to_folder(self): + history_id, library, destination = self.library_populator.setup_fetch_to_folder("simple_fetch") + bed_test_data_path = self.test_data_resolver.get_filename("4.bed") + items = [{"src": "path", "path": bed_test_data_path, "info": "my cool bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_link_data_only(self): + history_id, library, destination = self.library_populator.setup_fetch_to_folder("fetch_and_link") + bed_test_data_path = self.test_data_resolver.get_filename("4.bed") + items = [{"src": "path", "path": bed_test_data_path, "info": "my cool bed", "link_data_only": True}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + assert dataset["file_name"] == bed_test_data_path, dataset + + def test_fetch_recursive_archive(self): + history_id, library, destination = self.library_populator.setup_fetch_to_folder("recursive_archive") + bed_test_data_path = self.test_data_resolver.get_filename("testdir1.zip") + targets = [{ + "destination": destination, + "items_from": "archive", "src": "path", "path": bed_test_data_path, + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file1") + assert dataset["file_size"] == 6, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file2") + assert dataset["file_size"] == 6, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/dir1/file3") + assert dataset["file_size"] == 11, dataset + + def test_fetch_recursive_archive_to_library(self): + bed_test_data_path = self.test_data_resolver.get_filename("testdir1.zip") + targets = [{ + "destination": {"type": "library", "name": "My Cool Library"}, + "items_from": "archive", "src": "path", "path": bed_test_data_path, + }] + payload = { + "history_id": self.history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + libraries = self.library_populator.get_libraries() + matching = [l for l in libraries if l["name"] == "My Cool Library"] + assert len(matching) == 1 + library = matching[0] + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file1") + assert dataset["file_size"] == 6, dataset diff --git a/tools/data_source/upload.py b/tools/data_source/upload.py index 082c700fd43d..7e6fa0b71592 100644 --- a/tools/data_source/upload.py +++ b/tools/data_source/upload.py @@ -18,8 +18,12 @@ from galaxy import util from galaxy.datatypes import sniff -from galaxy.datatypes.binary import Binary from galaxy.datatypes.registry import Registry +from galaxy.datatypes.upload_util import ( + handle_sniffable_binary_check, + handle_unsniffable_binary_check, + UploadProblemException, +) from galaxy.util.checkers import ( check_binary, check_bz2, @@ -36,12 +40,6 @@ assert sys.version_info[:2] >= (2, 7) -class UploadProblemException(Exception): - - def __init__(self, message): - self.message = message - - def file_err(msg, dataset, json_file): json_file.write(dumps(dict(type='dataset', ext='data', @@ -118,26 +116,21 @@ def add_file(dataset, registry, json_file, output_path): if dataset.type == 'url': try: - page = urlopen(dataset.path) # page will be .close()ed by sniff methods - temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) + dataset.path = sniff.stream_url_to_file(dataset.path) except Exception as e: raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e))) - dataset.path = temp_name + # See if we have an empty file if not os.path.exists(dataset.path): raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path) + if not os.path.getsize(dataset.path) > 0: raise UploadProblemException('The uploaded file is empty') + # Is dataset content supported sniffable binary? is_binary = check_binary(dataset.path) if is_binary: - # Sniff the data type - guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order) - # Set data_type only if guessed_ext is a binary datatype - datatype = registry.get_datatype_by_extension(guessed_ext) - if isinstance(datatype, Binary): - data_type = guessed_ext - ext = guessed_ext + data_type, ext = handle_sniffable_binary_check(data_type, ext, dataset.path, registry) if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): @@ -260,18 +253,9 @@ def add_file(dataset, registry, json_file, output_path): dataset.name = uncompressed_name data_type = 'zip' if not data_type: - if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type): - # We have a binary dataset, but it is not Bam, Sff or Pdf - data_type = 'binary' - parts = dataset.name.split(".") - if len(parts) > 1: - ext = parts[-1].strip().lower() - is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext) - if check_content and not is_ext_unsniffable_binary: - raise UploadProblemException('The uploaded binary file contains inappropriate content') - elif is_ext_unsniffable_binary and dataset.file_type != ext: - err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext) - raise UploadProblemException(err_msg) + data_type, ext = handle_unsniffable_binary_check( + data_type, ext, dataset.path, dataset.name, is_binary, dataset.file_type, check_content, registry + ) if not data_type: # We must have a text file if check_content and check_html(dataset.path):