diff --git a/lib/galaxy/actions/library.py b/lib/galaxy/actions/library.py index 1eb363e09dfb..14e2eeac907a 100644 --- a/lib/galaxy/actions/library.py +++ b/lib/galaxy/actions/library.py @@ -249,16 +249,14 @@ def _make_library_uploaded_dataset(self, trans, params, name, path, type, librar uploaded_dataset.to_posix_lines = params.get('to_posix_lines', None) uploaded_dataset.space_to_tab = params.get('space_to_tab', None) uploaded_dataset.tag_using_filenames = params.get('tag_using_filenames', True) + uploaded_dataset.purge_source = getattr(trans.app.config, 'ftp_upload_purge', True) if in_folder: uploaded_dataset.in_folder = in_folder uploaded_dataset.data = upload_common.new_upload(trans, 'api', uploaded_dataset, library_bunch) uploaded_dataset.link_data_only = link_data_only uploaded_dataset.uuid = uuid_str if link_data_only == 'link_to_files': - uploaded_dataset.data.file_name = os.path.abspath(path) - # Since we are not copying the file into Galaxy's managed - # default file location, the dataset should never be purgable. - uploaded_dataset.data.dataset.purgable = False + uploaded_dataset.data.link_to(path) trans.sa_session.add_all((uploaded_dataset.data, uploaded_dataset.data.dataset)) trans.sa_session.flush() return uploaded_dataset diff --git a/lib/galaxy/app.py b/lib/galaxy/app.py index ee03bf35cbef..574324b9aab3 100644 --- a/lib/galaxy/app.py +++ b/lib/galaxy/app.py @@ -12,7 +12,9 @@ from galaxy import config, jobs from galaxy.jobs import metrics as job_metrics from galaxy.managers.collections import DatasetCollectionManager +from galaxy.managers.folders import FolderManager from galaxy.managers.histories import HistoryManager +from galaxy.managers.libraries import LibraryManager from galaxy.managers.tags import GalaxyTagManager from galaxy.openid.providers import OpenIDProviders from galaxy.queue_worker import GalaxyQueueWorker @@ -96,6 +98,8 @@ def __init__(self, **kwargs): self.history_manager = HistoryManager(self) self.dependency_resolvers_view = DependencyResolversView(self) self.test_data_resolver = test_data.TestDataResolver(file_dirs=self.config.tool_test_data_directories) + self.library_folder_manager = FolderManager() + self.library_manager = LibraryManager() # Tool Data Tables self._configure_tool_data_tables(from_shed_config=False) diff --git a/lib/galaxy/datatypes/sniff.py b/lib/galaxy/datatypes/sniff.py index 197a9bf18e51..c69373b0669f 100644 --- a/lib/galaxy/datatypes/sniff.py +++ b/lib/galaxy/datatypes/sniff.py @@ -14,6 +14,7 @@ import zipfile from six import text_type +from six.moves.urllib.request import urlopen from galaxy import util from galaxy.util import compression_utils @@ -39,6 +40,12 @@ def get_test_fname(fname): return full_path +def stream_url_to_file(path): + page = urlopen(path) # page will be .close()ed in stream_to_file + temp_name = stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) + return temp_name + + def stream_to_open_named_file(stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict'): """Writes a stream to the provided file descriptor, returns the file name. 
Closes file descriptor""" # signature and behavor is somewhat odd, due to backwards compatibility, but this can/should be done better @@ -131,7 +138,7 @@ def convert_newlines(fname, in_place=True, tmp_dir=None, tmp_prefix="gxupload"): return (i, temp_name) -def sep2tabs(fname, in_place=True, patt="\\s+"): +def sep2tabs(fname, in_place=True, patt="\\s+", tmp_dir=None, tmp_prefix="gxupload"): """ Transforms in place a 'sep' separated file to a tab separated one @@ -143,13 +150,18 @@ def sep2tabs(fname, in_place=True, patt="\\s+"): '1\\t2\\n3\\t4\\n' """ regexp = re.compile(patt) - fd, temp_name = tempfile.mkstemp() + fd, temp_name = tempfile.mkstemp(prefix=tmp_prefix, dir=tmp_dir) with os.fdopen(fd, "wt") as fp: i = None for i, line in enumerate(open(fname)): - line = line.rstrip('\r\n') - elems = regexp.split(line) - fp.write("%s\n" % '\t'.join(elems)) + if line.endswith("\r"): + line = line.rstrip('\r') + elems = regexp.split(line) + fp.write("%s\r" % '\t'.join(elems)) + else: + line = line.rstrip('\n') + elems = regexp.split(line) + fp.write("%s\n" % '\t'.join(elems)) if i is None: i = 0 else: diff --git a/lib/galaxy/datatypes/upload_util.py b/lib/galaxy/datatypes/upload_util.py new file mode 100644 index 000000000000..97bb11862ca3 --- /dev/null +++ b/lib/galaxy/datatypes/upload_util.py @@ -0,0 +1,47 @@ +from galaxy.datatypes import sniff +from galaxy.datatypes.binary import Binary + + +class UploadProblemException(Exception): + + def __init__(self, message): + self.message = message + + +def handle_unsniffable_binary_check(data_type, ext, path, name, is_binary, requested_ext, check_content, registry): + """Return modified values of data_type and ext if unsniffable binary encountered. + + Throw UploadProblemException if content problems or extension mismatches occur. + + Precondition: check_binary called returned True. + """ + if is_binary or registry.is_extension_unsniffable_binary(requested_ext): + # We have a binary dataset, but it is not Bam, Sff or Pdf + data_type = 'binary' + parts = name.split(".") + if len(parts) > 1: + ext = parts[-1].strip().lower() + is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext) + if check_content and not is_ext_unsniffable_binary: + raise UploadProblemException('The uploaded binary file contains inappropriate content') + + elif is_ext_unsniffable_binary and requested_ext != ext: + err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext) + raise UploadProblemException(err_msg) + return data_type, ext + + +def handle_sniffable_binary_check(data_type, ext, path, registry): + """Return modified values of data_type and ext if sniffable binary encountered. + + Precondition: check_binary called returned True. 
+ """ + # Sniff the data type + guessed_ext = sniff.guess_ext(path, registry.sniff_order) + # Set data_type only if guessed_ext is a binary datatype + datatype = registry.get_datatype_by_extension(guessed_ext) + if isinstance(datatype, Binary): + data_type = guessed_ext + ext = guessed_ext + + return data_type, ext diff --git a/lib/galaxy/dependencies/pinned-requirements.txt b/lib/galaxy/dependencies/pinned-requirements.txt index 5e5ec8a5921a..0c7f590ec853 100644 --- a/lib/galaxy/dependencies/pinned-requirements.txt +++ b/lib/galaxy/dependencies/pinned-requirements.txt @@ -15,6 +15,7 @@ uWSGI==2.0.15 pysam==0.14 # pure Python packages +bdbag==1.1.1 bleach==2.1.3 bz2file==0.98; python_version < '3.3' ipaddress==1.0.18; python_version < '3.3' diff --git a/lib/galaxy/jobs/__init__.py b/lib/galaxy/jobs/__init__.py index 9e16f055df8a..73d00a8ec1e3 100644 --- a/lib/galaxy/jobs/__init__.py +++ b/lib/galaxy/jobs/__init__.py @@ -1380,7 +1380,7 @@ def path_rewriter(path): collected_datasets = { 'primary': self.tool.collect_primary_datasets(out_data, self.get_tool_provided_job_metadata(), tool_working_directory, input_ext, input_dbkey) } - self.tool.collect_dynamic_collections( + self.tool.collect_dynamic_outputs( out_collections, self.get_tool_provided_job_metadata(), job_working_directory=tool_working_directory, diff --git a/lib/galaxy/managers/collections.py b/lib/galaxy/managers/collections.py index fe688abad537..cf507c3c2185 100644 --- a/lib/galaxy/managers/collections.py +++ b/lib/galaxy/managers/collections.py @@ -46,17 +46,22 @@ def __init__(self, app): self.tag_manager = tags.GalaxyTagManager(app.model.context) self.ldda_manager = lddas.LDDAManager(app) - def precreate_dataset_collection_instance(self, trans, parent, name, implicit_inputs, implicit_output_name, structure): + def precreate_dataset_collection_instance(self, trans, parent, name, structure, implicit_inputs=None, implicit_output_name=None): # TODO: prebuild all required HIDs and send them in so no need to flush in between. 
- dataset_collection = self.precreate_dataset_collection(structure) + dataset_collection = self.precreate_dataset_collection(structure, allow_unitialized_element=implicit_output_name is not None) instance = self._create_instance_for_collection( trans, parent, name, dataset_collection, implicit_inputs=implicit_inputs, implicit_output_name=implicit_output_name, flush=False ) return instance - def precreate_dataset_collection(self, structure): - if structure.is_leaf or not structure.children_known: - return model.DatasetCollectionElement.UNINITIALIZED_ELEMENT + def precreate_dataset_collection(self, structure, allow_unitialized_element=True): + has_structure = not structure.is_leaf and structure.children_known + if not has_structure and allow_unitialized_element: + dataset_collection = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT + elif not has_structure: + collection_type_description = structure.collection_type_description + dataset_collection = model.DatasetCollection(populated=False) + dataset_collection.collection_type = collection_type_description.collection_type else: collection_type_description = structure.collection_type_description dataset_collection = model.DatasetCollection(populated=False) @@ -67,7 +72,7 @@ def precreate_dataset_collection(self, structure): if substructure.is_leaf: element = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT else: - element = self.precreate_dataset_collection(substructure) + element = self.precreate_dataset_collection(substructure, allow_unitialized_element=allow_unitialized_element) element = model.DatasetCollectionElement( element=element, @@ -78,7 +83,7 @@ def precreate_dataset_collection(self, structure): dataset_collection.elements = elements dataset_collection.element_count = len(elements) - return dataset_collection + return dataset_collection def create(self, trans, parent, name, collection_type, element_identifiers=None, elements=None, implicit_collection_info=None, trusted_identifiers=None, diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index 66493c441111..67b8dfdf03a2 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -2035,6 +2035,12 @@ def set_file_name(self, filename): return self.dataset.set_file_name(filename) file_name = property(get_file_name, set_file_name) + def link_to(self, path): + self.file_name = os.path.abspath(path) + # Since we are not copying the file into Galaxy's managed + # default file location, the dataset should never be purgable. + self.dataset.purgable = False + @property def extra_files_path(self): return self.dataset.extra_files_path diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index f87ffa635462..719174aba27f 100755 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -103,6 +103,7 @@ # Tools that require Galaxy's Python environment to be preserved. GALAXY_LIB_TOOLS_UNVERSIONED = [ "upload1", + "__DATA_FETCH__", # Legacy tools bundled with Galaxy. 
"vcf_to_maf_customtrack1", "laj_1", @@ -1107,7 +1108,10 @@ def parse_input_elem(self, page_source, enctypes, context=None): group.file_type_name = elem.get('file_type_name', group.file_type_name) group.default_file_type = elem.get('default_file_type', group.default_file_type) group.metadata_ref = elem.get('metadata_ref', group.metadata_ref) - rval[group.file_type_name].refresh_on_change = True + try: + rval[group.file_type_name].refresh_on_change = True + except KeyError: + pass group_page_source = XmlPageSource(elem) group.inputs = self.parse_input_elem(group_page_source, enctypes, context) rval[group.name] = group @@ -1658,10 +1662,10 @@ def collect_primary_datasets(self, output, tool_provided_metadata, job_working_d """ return output_collect.collect_primary_datasets(self, output, tool_provided_metadata, job_working_directory, input_ext, input_dbkey=input_dbkey) - def collect_dynamic_collections(self, output, tool_provided_metadata, **kwds): - """ Find files corresponding to dynamically structured collections. + def collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds): + """Collect dynamic outputs associated with a job from this tool. """ - return output_collect.collect_dynamic_collections(self, output, tool_provided_metadata, **kwds) + return output_collect.collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds) def to_archive(self): tool = self diff --git a/lib/galaxy/tools/actions/upload.py b/lib/galaxy/tools/actions/upload.py index 7dbb2f47d583..cdcc28b1adea 100644 --- a/lib/galaxy/tools/actions/upload.py +++ b/lib/galaxy/tools/actions/upload.py @@ -1,15 +1,20 @@ +import json import logging +import os +from galaxy.dataset_collections.structure import UnitializedTree +from galaxy.exceptions import RequestParameterMissingException from galaxy.tools.actions import upload_common from galaxy.util import ExecutionTimer +from galaxy.util.bunch import Bunch from . import ToolAction log = logging.getLogger(__name__) -class UploadToolAction(ToolAction): +class BaseUploadToolAction(ToolAction): - def execute(self, tool, trans, incoming={}, set_output_hid=True, history=None, **kwargs): + def execute(self, tool, trans, incoming={}, history=None, **kwargs): dataset_upload_inputs = [] for input_name, input in tool.inputs.items(): if input.type == "upload_dataset": @@ -19,18 +24,125 @@ def execute(self, tool, trans, incoming={}, set_output_hid=True, history=None, * persisting_uploads_timer = ExecutionTimer() incoming = upload_common.persist_uploads(incoming, trans) log.debug("Persisted uploads %s" % persisting_uploads_timer) + rval = self._setup_job(tool, trans, incoming, dataset_upload_inputs, history) + return rval + + def _setup_job(self, tool, trans, incoming, dataset_upload_inputs, history): + """Take persisted uploads and create a job for given tool.""" + + def _create_job(self, *args, **kwds): + """Wrapper around upload_common.create_job with a timer.""" + create_job_timer = ExecutionTimer() + rval = upload_common.create_job(*args, **kwds) + log.debug("Created upload job %s" % create_job_timer) + return rval + + +class UploadToolAction(BaseUploadToolAction): + + def _setup_job(self, tool, trans, incoming, dataset_upload_inputs, history): check_timer = ExecutionTimer() - # We can pass an empty string as the cntrller here since it is used to check whether we - # are in an admin view, and this tool is currently not used there. 
uploaded_datasets = upload_common.get_uploaded_datasets(trans, '', incoming, dataset_upload_inputs, history=history) if not uploaded_datasets: return None, 'No data was entered in the upload form, please go back and choose data to upload.' - log.debug("Checked uploads %s" % check_timer) - create_job_timer = ExecutionTimer() json_file_path = upload_common.create_paramfile(trans, uploaded_datasets) data_list = [ud.data for ud in uploaded_datasets] - rval = upload_common.create_job(trans, incoming, tool, json_file_path, data_list, history=history) - log.debug("Created upload job %s" % create_job_timer) - return rval + log.debug("Checked uploads %s" % check_timer) + return self._create_job( + trans, incoming, tool, json_file_path, data_list, history=history + ) + + +class FetchUploadToolAction(BaseUploadToolAction): + + def _setup_job(self, tool, trans, incoming, dataset_upload_inputs, history): + # Now replace references in requests with these. + files = incoming.get("files", []) + files_iter = iter(files) + request = json.loads(incoming.get("request_json")) + + def replace_file_srcs(request_part): + if isinstance(request_part, dict): + if request_part.get("src", None) == "files": + path_def = next(files_iter) + if path_def is None or path_def["file_data"] is None: + raise RequestParameterMissingException("Failed to find uploaded file matching target with src='files'") + request_part["path"] = path_def["file_data"]["local_filename"] + if "name" not in request_part: + request_part["name"] = path_def["file_data"]["filename"] + request_part["src"] = "path" + else: + for key, value in request_part.items(): + replace_file_srcs(value) + elif isinstance(request_part, list): + for value in request_part: + replace_file_srcs(value) + + replace_file_srcs(request) + + outputs = [] + for target in request.get("targets", []): + destination = target.get("destination") + destination_type = destination.get("type") + # Start by just pre-creating HDAs. + if destination_type == "hdas": + if target.get("elements_from"): + # Dynamic collection required I think. + continue + _precreate_fetched_hdas(trans, history, target, outputs) + + if destination_type == "hdca": + _precreate_fetched_collection_instance(trans, history, target, outputs) + + incoming["request_json"] = json.dumps(request) + return self._create_job( + trans, incoming, tool, None, outputs, history=history + ) + + +def _precreate_fetched_hdas(trans, history, target, outputs): + for item in target.get("elements", []): + name = item.get("name", None) + if name is None: + src = item.get("src", None) + if src == "url": + url = item.get("url") + if name is None: + name = url.split("/")[-1] + elif src == "path": + path = item["path"] + if name is None: + name = os.path.basename(path) + + file_type = item.get("ext", "auto") + dbkey = item.get("dbkey", "?") + uploaded_dataset = Bunch( + type='file', name=name, file_type=file_type, dbkey=dbkey + ) + data = upload_common.new_upload(trans, '', uploaded_dataset, library_bunch=None, history=history) + outputs.append(data) + item["object_id"] = data.id + + +def _precreate_fetched_collection_instance(trans, history, target, outputs): + collection_type = target.get("collection_type") + if not collection_type: + # Can't precreate collections of unknown type at this time. 
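# Illustrative sketch (editorial; name and element contents are placeholders): a
# target that can be precreated here looks like
#     {"destination": {"type": "hdca"}, "name": "fetched list",
#      "collection_type": "list", "elements": [{"src": "url", "url": "https://example.org/a.tsv"}]}
# It passes the checks below, an empty HDCA is created, and its id is written back
# into the request as destination["object_id"] for the fetch job to populate.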
+ return + + name = target.get("name") + if not name: + return + + collections_service = trans.app.dataset_collections_service + collection_type_description = collections_service.collection_type_descriptions.for_collection_type(collection_type) + structure = UnitializedTree(collection_type_description) + hdca = collections_service.precreate_dataset_collection_instance( + trans, history, name, structure=structure + ) + outputs.append(hdca) + # Following flushed needed for an ID. + trans.sa_session.flush() + target["destination"]["object_id"] = hdca.id diff --git a/lib/galaxy/tools/actions/upload_common.py b/lib/galaxy/tools/actions/upload_common.py index 44550ea1dde0..4a686a107d45 100644 --- a/lib/galaxy/tools/actions/upload_common.py +++ b/lib/galaxy/tools/actions/upload_common.py @@ -16,7 +16,7 @@ from urllib.parse import urlparse from galaxy import datatypes, util -from galaxy.exceptions import ObjectInvalid +from galaxy.exceptions import ConfigDoesNotAllowException, ObjectInvalid from galaxy.managers import tags from galaxy.util import unicodify from galaxy.util.odict import odict @@ -102,7 +102,7 @@ def validate_url(url, ip_whitelist): pass else: # Otherwise, we deny access. - raise Exception("Access to this address in not permitted by server configuration") + raise ConfigDoesNotAllowException("Access to this address in not permitted by server configuration") return url @@ -123,7 +123,7 @@ def persist_uploads(params, trans): local_filename=local_filename) elif type(f) == dict and 'local_filename' not in f: raise Exception('Uploaded file was encoded in a way not understood by Galaxy.') - if upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '': + if 'url_paste' in upload_dataset and upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '': upload_dataset['url_paste'] = datatypes.sniff.stream_to_file( StringIO(validate_url(upload_dataset['url_paste'], trans.app.config.fetch_url_whitelist_ips)), prefix="strio_url_paste_" @@ -380,7 +380,7 @@ def _chown(path): return json_file_path -def create_job(trans, params, tool, json_file_path, data_list, folder=None, history=None, job_params=None): +def create_job(trans, params, tool, json_file_path, outputs, folder=None, history=None, job_params=None): """ Create the upload job. 
""" @@ -408,21 +408,28 @@ def create_job(trans, params, tool, json_file_path, data_list, folder=None, hist job.add_parameter(name, value) job.add_parameter('paramfile', dumps(json_file_path)) object_store_id = None - for i, dataset in enumerate(data_list): - if folder: - job.add_output_library_dataset('output%i' % i, dataset) + for i, output_object in enumerate(outputs): + output_name = "output%i" % i + if hasattr(output_object, "collection"): + job.add_output_dataset_collection(output_name, output_object) + output_object.job = job else: - job.add_output_dataset('output%i' % i, dataset) - # Create an empty file immediately - if not dataset.dataset.external_filename: - dataset.dataset.object_store_id = object_store_id - try: - trans.app.object_store.create(dataset.dataset) - except ObjectInvalid: - raise Exception('Unable to create output dataset: object store is full') - object_store_id = dataset.dataset.object_store_id - trans.sa_session.add(dataset) - # open( dataset.file_name, "w" ).close() + dataset = output_object + if folder: + job.add_output_library_dataset(output_name, dataset) + else: + job.add_output_dataset(output_name, dataset) + # Create an empty file immediately + if not dataset.dataset.external_filename: + dataset.dataset.object_store_id = object_store_id + try: + trans.app.object_store.create(dataset.dataset) + except ObjectInvalid: + raise Exception('Unable to create output dataset: object store is full') + object_store_id = dataset.dataset.object_store_id + + trans.sa_session.add(output_object) + job.object_store_id = object_store_id job.set_state(job.states.NEW) job.set_handler(tool.get_job_handler(None)) @@ -436,8 +443,9 @@ def create_job(trans, params, tool, json_file_path, data_list, folder=None, hist trans.app.job_manager.job_queue.put(job.id, job.tool_id) trans.log_event("Added job to the job queue, id: %s" % str(job.id), tool_id=job.tool_id) output = odict() - for i, v in enumerate(data_list): - output['output%i' % i] = v + for i, v in enumerate(outputs): + if not hasattr(output_object, "collection_type"): + output['output%i' % i] = v return job, output diff --git a/lib/galaxy/tools/data_fetch.py b/lib/galaxy/tools/data_fetch.py new file mode 100644 index 000000000000..858d0c8d2349 --- /dev/null +++ b/lib/galaxy/tools/data_fetch.py @@ -0,0 +1,325 @@ +import argparse +import errno +import json +import os +import shutil +import sys +import tempfile + +import bdbag.bdbag_api + +from galaxy.datatypes import sniff +from galaxy.datatypes.registry import Registry +from galaxy.datatypes.upload_util import ( + handle_sniffable_binary_check, + handle_unsniffable_binary_check, + UploadProblemException, +) +from galaxy.util import in_directory +from galaxy.util.checkers import ( + check_binary, + check_html, +) +from galaxy.util.compression_utils import CompressedFile + +DESCRIPTION = """Data Import Script""" + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + args = _arg_parser().parse_args(argv) + + registry = Registry() + registry.load_datatypes(root_dir=args.galaxy_root, config=args.datatypes_registry) + + request_path = args.request + assert os.path.exists(request_path) + with open(request_path) as f: + request = json.load(f) + + upload_config = UploadConfig(request, registry) + galaxy_json = _request_to_galaxy_json(upload_config, request) + with open("galaxy.json", "w") as f: + json.dump(galaxy_json, f) + + +def _request_to_galaxy_json(upload_config, request): + targets = request.get("targets", []) + fetched_targets = [] + + for target in targets: + 
fetched_target = _fetch_target(upload_config, target) + fetched_targets.append(fetched_target) + + return {"__unnamed_outputs": fetched_targets} + + +def _fetch_target(upload_config, target): + destination = target.get("destination", None) + assert destination, "No destination defined." + + def expand_elements_from(target_or_item): + elements_from = target_or_item.get("elements_from", None) + items = None + if elements_from: + if elements_from == "archive": + decompressed_directory = _decompress_target(target_or_item) + items = _directory_to_items(decompressed_directory) + elif elements_from == "bagit": + _, elements_from_path = _has_src_to_path(target_or_item) + items = _bagit_to_items(elements_from_path) + elif elements_from == "bagit_archive": + decompressed_directory = _decompress_target(target_or_item) + items = _bagit_to_items(decompressed_directory) + elif elements_from == "directory": + _, elements_from_path = _has_src_to_path(target_or_item) + items = _directory_to_items(elements_from_path) + else: + raise Exception("Unknown elements from type encountered [%s]" % elements_from) + + if items: + del target_or_item["elements_from"] + target_or_item["elements"] = items + + _for_each_src(expand_elements_from, target) + items = target.get("elements", None) + assert items is not None, "No element definition found for destination [%s]" % destination + + fetched_target = {} + fetched_target["destination"] = destination + if "collection_type" in target: + fetched_target["collection_type"] = target["collection_type"] + if "name" in target: + fetched_target["name"] = target["name"] + + def _resolve_src(item): + converted_path = None + + name, path = _has_src_to_path(item) + dbkey = item.get("dbkey", "?") + requested_ext = item.get("ext", "auto") + info = item.get("info", None) + object_id = item.get("object_id", None) + link_data_only = upload_config.link_data_only + if "link_data_only" in item: + # Allow overriding this on a per file basis. + link_data_only = _link_data_only(item) + to_posix_lines = upload_config.get_option(item, "to_posix_lines") + space_to_tab = upload_config.get_option(item, "space_to_tab") + in_place = item.get("in_place", False) + purge_source = item.get("purge_source", True) + + # Follow upload.py logic but without the auto-decompress logic. 
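# Illustrative sketch (editorial; the path is a placeholder): the options resolved
# just above can also be set per item, overriding the request-level defaults, e.g.
#     {"src": "path", "path": "/data/shared/sample.tsv", "link_data_only": True}
# links the existing file in place rather than copying it into the working
# directory, and the to_posix_lines / space_to_tab conversions below are skipped.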
+ registry = upload_config.registry + check_content = upload_config.check_content + data_type, ext = None, requested_ext + + is_binary = check_binary(path) + if is_binary: + data_type, ext = handle_sniffable_binary_check(data_type, ext, path, registry) + if data_type is None: + root_datatype = registry.get_datatype_by_extension(ext) + if getattr(root_datatype, 'compressed', False): + data_type = 'compressed archive' + ext = ext + elif is_binary: + data_type, ext = handle_unsniffable_binary_check( + data_type, ext, path, name, is_binary, requested_ext, check_content, registry + ) + if not data_type and check_content and check_html(path): + raise UploadProblemException('The uploaded file contains inappropriate HTML content') + + if data_type != 'binary': + if not link_data_only: + if to_posix_lines: + if space_to_tab: + line_count, converted_path = sniff.convert_newlines_sep2tabs(path, in_place=in_place, tmp_dir=".") + else: + line_count, converted_path = sniff.convert_newlines(path, in_place=in_place, tmp_dir=".") + else: + if space_to_tab: + line_count, converted_path = sniff.sep2tabs(path, in_place=in_place, tmp_dir=".") + + if requested_ext == 'auto': + ext = sniff.guess_ext(converted_path or path, registry.sniff_order) + else: + ext = requested_ext + + data_type = ext + + if ext == 'auto' and data_type == 'binary': + ext = 'data' + if ext == 'auto' and requested_ext: + ext = requested_ext + if ext == 'auto': + ext = 'data' + + datatype = registry.get_datatype_by_extension(ext) + if link_data_only: + # Never alter a file that will not be copied to Galaxy's local file store. + if datatype.dataset_content_needs_grooming(path): + err_msg = 'The uploaded files need grooming, so change your Copy data into Galaxy? selection to be ' + \ + 'Copy files into Galaxy instead of Link to files without copying into Galaxy so grooming can be performed.' + raise UploadProblemException(err_msg) + + # If this file is not in the workdir make sure it gets there. 
+ if not link_data_only and converted_path: + path = upload_config.ensure_in_working_directory(converted_path, purge_source, in_place) + elif not link_data_only: + path = upload_config.ensure_in_working_directory(path, purge_source, in_place) + + if not link_data_only and datatype and datatype.dataset_content_needs_grooming(path): + # Groom the dataset content if necessary + datatype.groom_dataset_content(path) + + rval = {"name": name, "filename": path, "dbkey": dbkey, "ext": ext, "link_data_only": link_data_only} + if info is not None: + rval["info"] = info + if object_id is not None: + rval["object_id"] = object_id + return rval + + elements = elements_tree_map(_resolve_src, items) + + fetched_target["elements"] = elements + return fetched_target + + +def _bagit_to_items(directory): + bdbag.bdbag_api.resolve_fetch(directory) + bdbag.bdbag_api.validate_bag(directory) + items = _directory_to_items(os.path.join(directory, "data")) + return items + + +def _decompress_target(target): + elements_from_name, elements_from_path = _has_src_to_path(target) + temp_directory = tempfile.mkdtemp(prefix=elements_from_name, dir=".") + decompressed_directory = CompressedFile(elements_from_path).extract(temp_directory) + return decompressed_directory + + +def elements_tree_map(f, items): + new_items = [] + for item in items: + if "elements" in item: + new_item = item.copy() + new_item["elements"] = elements_tree_map(f, item["elements"]) + new_items.append(new_item) + else: + new_items.append(f(item)) + return new_items + + +def _directory_to_items(directory): + items = [] + dir_elements = {} + for root, dirs, files in os.walk(directory): + if root in dir_elements: + target = dir_elements[root] + else: + target = items + for dir in dirs: + dir_dict = {"name": dir, "elements": []} + dir_elements[os.path.join(root, dir)] = dir_dict["elements"] + target.append(dir_dict) + for file in files: + target.append({"src": "path", "path": os.path.join(root, file)}) + + return items + + +def _has_src_to_path(item): + assert "src" in item, item + src = item.get("src") + name = item.get("name") + if src == "url": + url = item.get("url") + path = sniff.stream_url_to_file(url) + if name is None: + name = url.split("/")[-1] + else: + assert src == "path" + path = item["path"] + if name is None: + name = os.path.basename(path) + return name, path + + +def _arg_parser(): + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.add_argument("--galaxy-root") + parser.add_argument("--datatypes-registry") + parser.add_argument("--request-version") + parser.add_argument("--request") + return parser + + +class UploadConfig(object): + + def __init__(self, request, registry): + self.registry = registry + self.check_content = request.get("check_content" , True) + self.to_posix_lines = request.get("to_posix_lines", False) + self.space_to_tab = request.get("space_to_tab", False) + self.link_data_only = _link_data_only(request) + + self.__workdir = os.path.abspath(".") + self.__upload_count = 0 + + def get_option(self, item, key): + """Return item[key] if specified otherwise use default from UploadConfig. + + This default represents the default for the whole request instead item which + is the option for individual files. 
+ """ + if key in item: + return item[key] + else: + return getattr(self, key) + + def __new_dataset_path(self): + path = "gxupload_%d" % self.__upload_count + self.__upload_count += 1 + return path + + def ensure_in_working_directory(self, path, purge_source, in_place): + if in_directory(path, self.__workdir): + return path + + new_path = self.__new_dataset_path() + if purge_source: + try: + shutil.move(path, new_path) + except OSError as e: + # We may not have permission to remove converted_path + if e.errno != errno.EACCES: + raise + else: + shutil.copy(path, new_path) + + return new_path + + +def _link_data_only(has_config_dict): + link_data_only = has_config_dict.get("link_data_only", False) + if not isinstance(link_data_only, bool): + # Allow the older string values of 'copy_files' and 'link_to_files' + link_data_only = link_data_only == "copy_files" + return link_data_only + + +def _for_each_src(f, obj): + if isinstance(obj, list): + for item in obj: + _for_each_src(f, item) + if isinstance(obj, dict): + if "src" in obj: + f(obj) + for key, value in obj.items(): + _for_each_src(f, value) + + +if __name__ == "__main__": + main() diff --git a/lib/galaxy/tools/data_fetch.xml b/lib/galaxy/tools/data_fetch.xml new file mode 100644 index 000000000000..5160cb2989c8 --- /dev/null +++ b/lib/galaxy/tools/data_fetch.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + $request_json + + + + + diff --git a/lib/galaxy/tools/execute.py b/lib/galaxy/tools/execute.py index 3b1fffa464e8..cda1755aa62b 100644 --- a/lib/galaxy/tools/execute.py +++ b/lib/galaxy/tools/execute.py @@ -275,9 +275,9 @@ def precreate_output_collections(self, history, params): trans=trans, parent=history, name=output_collection_name, + structure=effective_structure, implicit_inputs=implicit_inputs, implicit_output_name=output_name, - structure=effective_structure, ) collection_instance.implicit_collection_jobs = implicit_collection_jobs collection_instances[output_name] = collection_instance diff --git a/lib/galaxy/tools/parameters/output_collect.py b/lib/galaxy/tools/parameters/output_collect.py index f6b106884d72..93bf92eee08f 100644 --- a/lib/galaxy/tools/parameters/output_collect.py +++ b/lib/galaxy/tools/parameters/output_collect.py @@ -9,9 +9,11 @@ from collections import namedtuple from galaxy import util +from galaxy.dataset_collections.structure import UnitializedTree from galaxy.tools.parser.output_collection_def import ( DEFAULT_DATASET_COLLECTOR_DESCRIPTION, INPUT_DBKEY_TOKEN, + ToolProvidedMetadataDatasetCollection, ) from galaxy.util import ( ExecutionTimer, @@ -34,6 +36,9 @@ def get_new_dataset_meta_by_basename(self, output_name, basename): def has_failed_outputs(self): return False + def get_unnamed_outputs(self): + return [] + class LegacyToolProvidedMetadata(object): @@ -84,6 +89,9 @@ def has_failed_outputs(self): return found_failed + def get_unnamed_outputs(self): + return [] + class ToolProvidedMetadata(object): @@ -124,14 +132,21 @@ def _elements_to_datasets(self, elements, level=0): def has_failed_outputs(self): found_failed = False - for meta in self.tool_provided_job_metadata.values(): + for output_name, meta in self.tool_provided_job_metadata.items(): + if output_name == "__unnamed_outputs": + continue + if meta.get("failed", False): found_failed = True return found_failed + def get_unnamed_outputs(self): + log.debug("unnamed outputs [%s]" % self.tool_provided_job_metadata) + return self.tool_provided_job_metadata.get("__unnamed_outputs", []) -def collect_dynamic_collections( + +def 
collect_dynamic_outputs( tool, output_collections, tool_provided_metadata, @@ -140,6 +155,7 @@ def collect_dynamic_collections( job=None, input_dbkey="?", ): + app = tool.app collections_service = tool.app.dataset_collections_service job_context = JobContext( tool, @@ -149,6 +165,145 @@ def collect_dynamic_collections( inp_data, input_dbkey, ) + # unmapped outputs do not correspond to explicit outputs of the tool, they were inferred entirely + # from the tool provided metadata (e.g. galaxy.json). + for unnamed_output_dict in tool_provided_metadata.get_unnamed_outputs(): + assert "destination" in unnamed_output_dict + assert "elements" in unnamed_output_dict + destination = unnamed_output_dict["destination"] + elements = unnamed_output_dict["elements"] + + assert "type" in destination + destination_type = destination["type"] + assert destination_type in ["library_folder", "hdca", "hdas"] + trans = job_context.work_context + + # three destination types we need to handle here - "library_folder" (place discovered files in a library folder), + # "hdca" (place discovered files in a history dataset collection), and "hdas" (place discovered files in a history + # as stand-alone datasets). + if destination_type == "library_folder": + # populate a library folder (needs to be already have been created) + + library_folder_manager = app.library_folder_manager + library_folder = library_folder_manager.get(trans, app.security.decode_id(destination.get("library_folder_id"))) + + def add_elements_to_folder(elements, library_folder): + for element in elements: + if "elements" in element: + assert "name" in element + name = element["name"] + description = element.get("description") + nested_folder = library_folder_manager.create(trans, library_folder.id, name, description) + add_elements_to_folder(element["elements"], nested_folder) + else: + discovered_file = discovered_file_for_unnamed_output(element, job_working_directory) + fields_match = discovered_file.match + designation = fields_match.designation + visible = fields_match.visible + ext = fields_match.ext + dbkey = fields_match.dbkey + info = element.get("info", None) + link_data = discovered_file.match.link_data + + # Create new primary dataset + name = fields_match.name or designation + + job_context.create_dataset( + ext=ext, + designation=designation, + visible=visible, + dbkey=dbkey, + name=name, + filename=discovered_file.path, + info=info, + library_folder=library_folder, + link_data=link_data + ) + + add_elements_to_folder(elements, library_folder) + elif destination_type == "hdca": + # create or populate a dataset collection in the history + history = job.history + assert "collection_type" in unnamed_output_dict + object_id = destination.get("object_id") + if object_id: + sa_session = tool.app.model.context + hdca = sa_session.query(app.model.HistoryDatasetCollectionAssociation).get(int(object_id)) + else: + name = unnamed_output_dict.get("name", "unnamed collection") + collection_type = unnamed_output_dict["collection_type"] + collection_type_description = collections_service.collection_type_descriptions.for_collection_type(collection_type) + structure = UnitializedTree(collection_type_description) + hdca = collections_service.precreate_dataset_collection_instance( + trans, history, name, structure=structure + ) + filenames = odict.odict() + + def add_to_discovered_files(elements, parent_identifiers=[]): + for element in elements: + if "elements" in element: + add_to_discovered_files(element["elements"], parent_identifiers + 
[element["name"]]) + else: + discovered_file = discovered_file_for_unnamed_output(element, job_working_directory, parent_identifiers) + filenames[discovered_file.path] = discovered_file + + add_to_discovered_files(elements) + + collection = hdca.collection + collection_builder = collections_service.collection_builder_for( + collection + ) + job_context.populate_collection_elements( + collection, + collection_builder, + filenames, + ) + collection_builder.populate() + elif destination_type == "hdas": + # discover files as individual datasets for the target history + history = job.history + + datasets = [] + + def collect_elements_for_history(elements): + for element in elements: + if "elements" in element: + collect_elements_for_history(element["elements"]) + else: + discovered_file = discovered_file_for_unnamed_output(element, job_working_directory) + fields_match = discovered_file.match + designation = fields_match.designation + ext = fields_match.ext + dbkey = fields_match.dbkey + info = element.get("info", None) + link_data = discovered_file.match.link_data + + # Create new primary dataset + name = fields_match.name or designation + + hda_id = discovered_file.match.object_id + primary_dataset = None + if hda_id: + sa_session = tool.app.model.context + primary_dataset = sa_session.query(app.model.HistoryDatasetAssociation).get(hda_id) + + dataset = job_context.create_dataset( + ext=ext, + designation=designation, + visible=True, + dbkey=dbkey, + name=name, + filename=discovered_file.path, + info=info, + link_data=link_data, + primary_data=primary_dataset, + ) + dataset.raw_set_dataset_state('ok') + if not hda_id: + datasets.append(dataset) + + collect_elements_for_history(elements) + job.history.add_datasets(job_context.sa_session, datasets) for name, has_collection in output_collections.items(): if name not in tool.output_collections: @@ -168,13 +323,19 @@ def collect_dynamic_collections( collection.populated_state = collection.populated_states.NEW try: + collection_builder = collections_service.collection_builder_for( collection ) + dataset_collectors = map(dataset_collector, output_collection_def.dataset_collector_descriptions) + output_name = output_collection_def.name + filenames = job_context.find_files(output_name, collection, dataset_collectors) job_context.populate_collection_elements( collection, collection_builder, - output_collection_def, + filenames, + name=output_collection_def.name, + metadata_source_name=output_collection_def.metadata_source, ) collection_builder.populate() except Exception: @@ -194,6 +355,11 @@ def __init__(self, tool, tool_provided_metadata, job, job_working_directory, inp self.tool_provided_metadata = tool_provided_metadata self._permissions = None + @property + def work_context(self): + from galaxy.work.context import WorkRequestContext + return WorkRequestContext(self.app, user=self.job.user) + @property def permissions(self): if self._permissions is None: @@ -214,15 +380,14 @@ def find_files(self, output_name, collection, dataset_collectors): filenames[discovered_file.path] = discovered_file return filenames - def populate_collection_elements(self, collection, root_collection_builder, output_collection_def): + def populate_collection_elements(self, collection, root_collection_builder, filenames, name=None, metadata_source_name=None): # TODO: allow configurable sorting. 
# # # # - dataset_collectors = map(dataset_collector, output_collection_def.dataset_collector_descriptions) - output_name = output_collection_def.name - filenames = self.find_files(output_name, collection, dataset_collectors) + if name is None: + name = "unnamed output" element_datasets = [] for filename, discovered_file in filenames.items(): @@ -241,6 +406,8 @@ def populate_collection_elements(self, collection, root_collection_builder, outp # Create new primary dataset name = fields_match.name or designation + link_data = discovered_file.match.link_data + dataset = self.create_dataset( ext=ext, designation=designation, @@ -248,14 +415,15 @@ def populate_collection_elements(self, collection, root_collection_builder, outp dbkey=dbkey, name=name, filename=filename, - metadata_source_name=output_collection_def.metadata_source, + metadata_source_name=metadata_source_name, + link_data=link_data, ) log.debug( "(%s) Created dynamic collection dataset for path [%s] with element identifier [%s] for output [%s] %s", self.job.id, filename, designation, - output_collection_def.name, + name, create_dataset_timer, ) element_datasets.append((element_identifiers, dataset)) @@ -270,7 +438,7 @@ def populate_collection_elements(self, collection, root_collection_builder, outp log.debug( "(%s) Add dynamic collection datsets to history for output [%s] %s", self.job.id, - output_collection_def.name, + name, add_datasets_timer, ) @@ -300,12 +468,24 @@ def create_dataset( dbkey, name, filename, - metadata_source_name, + metadata_source_name=None, + info=None, + library_folder=None, + link_data=False, + primary_data=None, ): app = self.app sa_session = self.sa_session - primary_data = _new_hda(app, sa_session, ext, designation, visible, dbkey, self.permissions) + if primary_data is None: + if not library_folder: + primary_data = _new_hda(app, sa_session, ext, designation, visible, dbkey, self.permissions) + else: + primary_data = _new_ldda(self.work_context, name, ext, visible, dbkey, library_folder) + else: + primary_data.extension = ext + primary_data.visible = visible + primary_data.dbkey = dbkey # Copy metadata from one of the inputs if requested. metadata_source = None @@ -314,7 +494,11 @@ def create_dataset( sa_session.flush() # Move data from temp location to dataset location - app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) + if not link_data: + app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) + else: + primary_data.link_to(filename) + primary_data.set_size() # If match specified a name use otherwise generate one from # designation. @@ -325,6 +509,9 @@ def create_dataset( else: primary_data.init_meta() + if info is not None: + primary_data.info = info + primary_data.set_meta() primary_data.set_peek() @@ -491,6 +678,20 @@ def discover_files(output_name, tool_provided_metadata, extra_file_collectors, j yield DiscoveredFile(match.path, collector, match) +def discovered_file_for_unnamed_output(dataset, job_working_directory, parent_identifiers=[]): + extra_file_collector = DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR + target_directory = discover_target_directory(extra_file_collector.directory, job_working_directory) + filename = dataset["filename"] + # handle link_data_only here, verify filename is in directory if not linking... 
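# Illustrative element shape (editorial; values are placeholders): each dataset
# dict handled here was written to galaxy.json by the data fetch tool, e.g.
#     {"name": "sample", "filename": "gxupload_0", "ext": "tabular",
#      "dbkey": "?", "link_data_only": False}
# "filename" is resolved against the collector's target directory unless
# link_data_only is set, in which case the recorded path is used as-is.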
+ if not dataset.get("link_data_only"): + path = os.path.join(target_directory, filename) + if not util.in_directory(path, target_directory): + raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.") + else: + path = filename + return DiscoveredFile(path, extra_file_collector, JsonCollectedDatasetMatch(dataset, extra_file_collector, filename, path=path, parent_identifiers=parent_identifiers)) + + def discover_target_directory(dir_name, job_working_directory): if dir_name: directory = os.path.join(job_working_directory, dir_name) @@ -605,11 +806,12 @@ def _compose(f, g): class JsonCollectedDatasetMatch(object): - def __init__(self, as_dict, collector, filename, path=None): + def __init__(self, as_dict, collector, filename, path=None, parent_identifiers=[]): self.as_dict = as_dict self.collector = collector self.filename = filename self.path = path + self._parent_identifiers = parent_identifiers @property def designation(self): @@ -627,7 +829,7 @@ def designation(self): @property def element_identifiers(self): - return self.raw_element_identifiers or [self.designation] + return self._parent_identifiers + (self.raw_element_identifiers or [self.designation]) @property def raw_element_identifiers(self): @@ -664,6 +866,14 @@ def visible(self): except KeyError: return self.collector.default_visible + @property + def link_data(self): + return bool(self.as_dict.get("link_data_only", False)) + + @property + def object_id(self): + return self.as_dict.get("object_id", None) + class RegexCollectedDatasetMatch(JsonCollectedDatasetMatch): @@ -676,6 +886,42 @@ def __init__(self, re_match, collector, filename, path=None): UNSET = object() +def _new_ldda( + trans, + name, + ext, + visible, + dbkey, + library_folder, +): + ld = trans.app.model.LibraryDataset(folder=library_folder, name=name) + trans.sa_session.add(ld) + trans.sa_session.flush() + trans.app.security_agent.copy_library_permissions(trans, library_folder, ld) + + ldda = trans.app.model.LibraryDatasetDatasetAssociation(name=name, + extension=ext, + dbkey=dbkey, + library_dataset=ld, + user=trans.user, + create_dataset=True, + sa_session=trans.sa_session) + trans.sa_session.add(ldda) + ldda.state = ldda.states.OK + # Permissions must be the same on the LibraryDatasetDatasetAssociation and the associated LibraryDataset + trans.app.security_agent.copy_library_permissions(trans, ld, ldda) + # Copy the current user's DefaultUserPermissions to the new LibraryDatasetDatasetAssociation.dataset + trans.app.security_agent.set_all_dataset_permissions(ldda.dataset, trans.app.security_agent.user_get_default_permissions(trans.user)) + library_folder.add_library_dataset(ld, genome_build=dbkey) + trans.sa_session.add(library_folder) + trans.sa_session.flush() + + ld.library_dataset_dataset_association_id = ldda.id + trans.sa_session.add(ld) + trans.sa_session.flush() + return ldda + + def _new_hda( app, sa_session, @@ -702,3 +948,4 @@ def _new_hda( DEFAULT_DATASET_COLLECTOR = DatasetCollector(DEFAULT_DATASET_COLLECTOR_DESCRIPTION) +DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR = ToolMetadataDatasetCollector(ToolProvidedMetadataDatasetCollection()) diff --git a/lib/galaxy/tools/special_tools.py b/lib/galaxy/tools/special_tools.py index 953e69dee647..129b7064a941 100644 --- a/lib/galaxy/tools/special_tools.py +++ b/lib/galaxy/tools/special_tools.py @@ -4,6 +4,7 @@ SPECIAL_TOOLS = { "history export": "galaxy/tools/imp_exp/exp_history_to_archive.xml", "history import": 
"galaxy/tools/imp_exp/imp_history_from_archive.xml", + "data fetch": "galaxy/tools/data_fetch.xml", } diff --git a/lib/galaxy/util/compression_utils.py b/lib/galaxy/util/compression_utils.py index 7c70aaad6445..e4e4734d4d31 100644 --- a/lib/galaxy/util/compression_utils.py +++ b/lib/galaxy/util/compression_utils.py @@ -1,5 +1,10 @@ +from __future__ import absolute_import + import gzip import io +import logging +import os +import tarfile import zipfile from .checkers import ( @@ -8,6 +13,8 @@ is_gzip ) +log = logging.getLogger(__name__) + def get_fileobj(filename, mode="r", compressed_formats=None): """ @@ -45,3 +52,121 @@ def get_fileobj(filename, mode="r", compressed_formats=None): return io.TextIOWrapper(fh, encoding='utf-8') else: return fh + + +class CompressedFile(object): + + def __init__(self, file_path, mode='r'): + if tarfile.is_tarfile(file_path): + self.file_type = 'tar' + elif zipfile.is_zipfile(file_path) and not file_path.endswith('.jar'): + self.file_type = 'zip' + self.file_name = os.path.splitext(os.path.basename(file_path))[0] + if self.file_name.endswith('.tar'): + self.file_name = os.path.splitext(self.file_name)[0] + self.type = self.file_type + method = 'open_%s' % self.file_type + if hasattr(self, method): + self.archive = getattr(self, method)(file_path, mode) + else: + raise NameError('File type %s specified, no open method found.' % self.file_type) + + def extract(self, path): + '''Determine the path to which the archive should be extracted.''' + contents = self.getmembers() + extraction_path = path + common_prefix = '' + if len(contents) == 1: + # The archive contains a single file, return the extraction path. + if self.isfile(contents[0]): + extraction_path = os.path.join(path, self.file_name) + if not os.path.exists(extraction_path): + os.makedirs(extraction_path) + self.archive.extractall(extraction_path) + else: + # Get the common prefix for all the files in the archive. If the common prefix ends with a slash, + # or self.isdir() returns True, the archive contains a single directory with the desired contents. + # Otherwise, it contains multiple files and/or directories at the root of the archive. + common_prefix = os.path.commonprefix([self.getname(item) for item in contents]) + if len(common_prefix) >= 1 and not common_prefix.endswith(os.sep) and self.isdir(self.getmember(common_prefix)): + common_prefix += os.sep + if not common_prefix.endswith(os.sep): + common_prefix = '' + extraction_path = os.path.join(path, self.file_name) + if not os.path.exists(extraction_path): + os.makedirs(extraction_path) + self.archive.extractall(extraction_path) + # Since .zip files store unix permissions separately, we need to iterate through the zip file + # and set permissions on extracted members. + if self.file_type == 'zip': + for zipped_file in contents: + filename = self.getname(zipped_file) + absolute_filepath = os.path.join(extraction_path, filename) + external_attributes = self.archive.getinfo(filename).external_attr + # The 2 least significant bytes are irrelevant, the next two contain unix permissions. 
+ unix_permissions = external_attributes >> 16 + if unix_permissions != 0: + if os.path.exists(absolute_filepath): + os.chmod(absolute_filepath, unix_permissions) + else: + log.warning("Unable to change permission on extracted file '%s' as it does not exist" % absolute_filepath) + return os.path.abspath(os.path.join(extraction_path, common_prefix)) + + def getmembers_tar(self): + return self.archive.getmembers() + + def getmembers_zip(self): + return self.archive.infolist() + + def getname_tar(self, item): + return item.name + + def getname_zip(self, item): + return item.filename + + def getmember(self, name): + for member in self.getmembers(): + if self.getname(member) == name: + return member + + def getmembers(self): + return getattr(self, 'getmembers_%s' % self.type)() + + def getname(self, member): + return getattr(self, 'getname_%s' % self.type)(member) + + def isdir(self, member): + return getattr(self, 'isdir_%s' % self.type)(member) + + def isdir_tar(self, member): + return member.isdir() + + def isdir_zip(self, member): + if member.filename.endswith(os.sep): + return True + return False + + def isfile(self, member): + if not self.isdir(member): + return True + return False + + def open_tar(self, filepath, mode): + return tarfile.open(filepath, mode, errorlevel=0) + + def open_zip(self, filepath, mode): + return zipfile.ZipFile(filepath, mode) + + def zipfile_ok(self, path_to_archive): + """ + This function is a bit pedantic and not functionally necessary. It checks whether there is + no file pointing outside of the extraction, because ZipFile.extractall() has some potential + security holes. See python zipfile documentation for more details. + """ + basename = os.path.realpath(os.path.dirname(path_to_archive)) + zip_archive = zipfile.ZipFile(path_to_archive) + for member in zip_archive.namelist(): + member_path = os.path.realpath(os.path.join(basename, member)) + if not member_path.startswith(basename): + return False + return True diff --git a/lib/galaxy/webapps/galaxy/api/_fetch_util.py b/lib/galaxy/webapps/galaxy/api/_fetch_util.py new file mode 100644 index 000000000000..7c5e2ea8e543 --- /dev/null +++ b/lib/galaxy/webapps/galaxy/api/_fetch_util.py @@ -0,0 +1,217 @@ +import logging +import os + +from galaxy.actions.library import ( + validate_path_upload, + validate_server_directory_upload, +) +from galaxy.exceptions import ( + RequestParameterInvalidException +) +from galaxy.tools.actions.upload_common import validate_url +from galaxy.util import ( + relpath, +) + +log = logging.getLogger(__name__) + +VALID_DESTINATION_TYPES = ["library", "library_folder", "hdca", "hdas"] +ELEMENTS_FROM_TYPE = ["archive", "bagit", "bagit_archive", "directory"] +# These elements_from cannot be sym linked to because they only exist during upload. +ELEMENTS_FROM_TRANSIENT_TYPES = ["archive", "bagit_archive"] + + +def validate_and_normalize_targets(trans, payload): + """Validate and normalize all src references in fetch targets. + + - Normalize ftp_import and server_dir src entries into simple path entires + with the relevant paths resolved and permissions / configuration checked. + - Check for file:// URLs in items src of "url" and convert them into path + src items - after verifying path pastes are allowed and user is admin. + - Check for valid URLs to be fetched for http and https entries. + - Based on Galaxy configuration and upload types set purge_source and in_place + as needed for each upload. 
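    For illustration (an editorial sketch; the URL and values are placeholders), a
    minimal payload with one normalized target, creating a single dataset in the
    current history from a remote URL, looks like:

        {"targets": [{"destination": {"type": "hdas"},
                      "elements": [{"src": "url",
                                    "url": "https://example.org/sample.tsv",
                                    "ext": "tabular", "dbkey": "?"}]}]}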
+ """ + targets = payload.get("targets", []) + + for target in targets: + destination = _get_required_item(target, "destination", "Each target must specify a 'destination'") + destination_type = _get_required_item(destination, "type", "Each target destination must specify a 'type'") + if "object_id" in destination: + raise RequestParameterInvalidException("object_id not allowed to appear in the request.") + + if destination_type not in VALID_DESTINATION_TYPES: + template = "Invalid target destination type [%s] encountered, must be one of %s" + msg = template % (destination_type, VALID_DESTINATION_TYPES) + raise RequestParameterInvalidException(msg) + if destination_type == "library": + library_name = _get_required_item(destination, "name", "Must specify a library name") + description = destination.get("description", "") + synopsis = destination.get("synopsis", "") + library = trans.app.library_manager.create( + trans, library_name, description=description, synopsis=synopsis + ) + destination["type"] = "library_folder" + for key in ["name", "description", "synopsis"]: + if key in destination: + del destination[key] + destination["library_folder_id"] = trans.app.security.encode_id(library.root_folder.id) + + # Unlike upload.py we don't transmit or use run_as_real_user in the job - we just make sure + # in_place and purge_source are set on the individual upload fetch sources as needed based + # on this. + run_as_real_user = trans.app.config.external_chown_script is not None # See comment in upload.py + purge_ftp_source = getattr(trans.app.config, 'ftp_upload_purge', True) and not run_as_real_user + + payload["check_content"] = trans.app.config.check_upload_content + + def check_src(item): + if "object_id" in item: + raise RequestParameterInvalidException("object_id not allowed to appear in the request.") + + # Normalize file:// URLs into paths. + if item["src"] == "url" and item["url"].startswith("file://"): + item["src"] = "path" + item["path"] = item["url"][len("file://"):] + del item["path"] + + if "in_place" in item: + raise RequestParameterInvalidException("in_place cannot be set in the upload request") + + src = item["src"] + + # Check link_data_only can only be set for certain src types and certain elements_from types. + _handle_invalid_link_data_only_elements_type(item) + if src not in ["path", "server_dir"]: + _handle_invalid_link_data_only_type(item) + elements_from = item.get("elements_from", None) + if elements_from and elements_from not in ELEMENTS_FROM_TYPE: + raise RequestParameterInvalidException("Invalid elements_from/items_from found in request") + + if src == "path" or (src == "url" and item["url"].startswith("file:")): + # Validate is admin, leave alone. + validate_path_upload(trans) + elif src == "server_dir": + # Validate and replace with path definition. + server_dir = item["server_dir"] + full_path, _ = validate_server_directory_upload(trans, server_dir) + item["src"] = "path" + item["path"] = full_path + elif src == "ftp_import": + ftp_path = item["ftp_path"] + full_path = None + + # It'd be nice if this can be de-duplicated with what is in parameters/grouping.py. 
+ user_ftp_dir = trans.user_ftp_dir + is_directory = False + + assert not os.path.islink(user_ftp_dir), "User FTP directory cannot be a symbolic link" + for (dirpath, dirnames, filenames) in os.walk(user_ftp_dir): + for filename in filenames: + if ftp_path == filename: + path = relpath(os.path.join(dirpath, filename), user_ftp_dir) + if not os.path.islink(os.path.join(dirpath, filename)): + full_path = os.path.abspath(os.path.join(user_ftp_dir, path)) + break + + for dirname in dirnames: + if ftp_path == dirname: + path = relpath(os.path.join(dirpath, dirname), user_ftp_dir) + if not os.path.islink(os.path.join(dirpath, dirname)): + full_path = os.path.abspath(os.path.join(user_ftp_dir, path)) + is_directory = True + break + + if is_directory: + # If the target is a directory - make sure no files under it are symbolic links + for (dirpath, dirnames, filenames) in os.walk(full_path): + for filename in filenames: + if ftp_path == filename: + path = relpath(os.path.join(dirpath, filename), full_path) + if not os.path.islink(os.path.join(dirpath, filename)): + full_path = False + break + + for dirname in dirnames: + if ftp_path == dirname: + path = relpath(os.path.join(dirpath, filename), full_path) + if not os.path.islink(os.path.join(dirpath, filename)): + full_path = False + break + + if not full_path: + raise RequestParameterInvalidException("Failed to find referenced ftp_path or symbolic link was enountered") + + item["src"] = "path" + item["path"] = full_path + item["purge_source"] = purge_ftp_source + elif src == "url": + url = item["url"] + looks_like_url = False + for url_prefix in ["http://", "https://", "ftp://", "ftps://"]: + if url.startswith(url_prefix): + looks_like_url = True + break + + if not looks_like_url: + raise RequestParameterInvalidException("Invalid URL [%s] found in src definition." % url) + + validate_url(url, trans.app.config.fetch_url_whitelist_ips) + item["in_place"] = run_as_real_user + elif src == "files": + item["in_place"] = run_as_real_user + + # Small disagreement with traditional uploads - we purge less by default since whether purging + # happens varies based on upload options in non-obvious ways. + # https://github.com/galaxyproject/galaxy/issues/5361 + if "purge_source" not in item: + item["purge_source"] = False + + _replace_request_syntax_sugar(targets) + _for_each_src(check_src, targets) + + +def _replace_request_syntax_sugar(obj): + # For data libraries and hdas to make sense - allow items and items_from in place of elements + # and elements_from. This is destructive and modifies the supplied request. 
+ if isinstance(obj, list): + for el in obj: + _replace_request_syntax_sugar(el) + elif isinstance(obj, dict): + if "items" in obj: + obj["elements"] = obj["items"] + del obj["items"] + if "items_from" in obj: + obj["elements_from"] = obj["items_from"] + del obj["items_from"] + for value in obj.values(): + _replace_request_syntax_sugar(value) + + +def _handle_invalid_link_data_only_type(item): + link_data_only = item.get("link_data_only", False) + if link_data_only: + raise RequestParameterInvalidException("link_data_only is invalid for src type [%s]" % item.get("src")) + + +def _handle_invalid_link_data_only_elements_type(item): + link_data_only = item.get("link_data_only", False) + if link_data_only and item.get("elements_from", False) in ELEMENTS_FROM_TRANSIENT_TYPES: + raise RequestParameterInvalidException("link_data_only is invalid for derived elements from [%s]" % item.get("elements_from")) + + +def _get_required_item(from_dict, key, message): + if key not in from_dict: + raise RequestParameterInvalidException(message) + return from_dict[key] + + +def _for_each_src(f, obj): + if isinstance(obj, list): + for item in obj: + _for_each_src(f, item) + if isinstance(obj, dict): + if "src" in obj: + f(obj) + for key, value in obj.items(): + _for_each_src(f, value) diff --git a/lib/galaxy/webapps/galaxy/api/library_contents.py b/lib/galaxy/webapps/galaxy/api/library_contents.py index 5b873f240878..58201190f5c8 100644 --- a/lib/galaxy/webapps/galaxy/api/library_contents.py +++ b/lib/galaxy/webapps/galaxy/api/library_contents.py @@ -185,7 +185,8 @@ def create(self, trans, library_id, payload, **kwd): * upload_option: (optional) one of 'upload_file' (default), 'upload_directory' or 'upload_paths' * server_dir: (optional, only if upload_option is 'upload_directory') relative path of the subdirectory of Galaxy - ``library_import_dir`` to upload. All and only the files (i.e. + ``library_import_dir`` (if admin) or ``user_library_import_dir`` + (if non-admin) to upload. All and only the files (i.e. no subdirectories) contained in the specified directory will be uploaded. * filesystem_paths: (optional, only if upload_option is diff --git a/lib/galaxy/webapps/galaxy/api/tools.py b/lib/galaxy/webapps/galaxy/api/tools.py index fa0a0804b7fb..08576ee812d3 100644 --- a/lib/galaxy/webapps/galaxy/api/tools.py +++ b/lib/galaxy/webapps/galaxy/api/tools.py @@ -15,9 +15,14 @@ from galaxy.web import _future_expose_api_raw_anonymous_and_sessionless as expose_api_raw_anonymous_and_sessionless from galaxy.web.base.controller import BaseAPIController from galaxy.web.base.controller import UsesVisualizationMixin +from ._fetch_util import validate_and_normalize_targets log = logging.getLogger(__name__) +# Do not allow these tools to be called directly - they (it) enforces extra security and +# provides access via a different API endpoint. +PROTECTED_TOOLS = ["__DATA_FETCH__"] + class ToolsController(BaseAPIController, UsesVisualizationMixin): """ @@ -361,12 +366,52 @@ def download(self, trans, id, **kwds): trans.response.headers["Content-Disposition"] = 'attachment; filename="%s.tgz"' % (id) return download_file + @expose_api_anonymous + def fetch(self, trans, payload, **kwd): + """Adapt clean API to tool-constrained API. 
+ """ + log.info("Keywords are %s" % payload) + request_version = '1' + history_id = payload.pop("history_id") + clean_payload = {} + files_payload = {} + for key, value in payload.items(): + if key == "key": + continue + if key.startswith('files_') or key.startswith('__files_'): + files_payload[key] = value + continue + clean_payload[key] = value + log.info("payload %s" % clean_payload) + validate_and_normalize_targets(trans, clean_payload) + clean_payload["check_content"] = trans.app.config.check_upload_content + request = dumps(clean_payload) + log.info(request) + create_payload = { + 'tool_id': "__DATA_FETCH__", + 'history_id': history_id, + 'inputs': { + 'request_version': request_version, + 'request_json': request, + }, + } + create_payload.update(files_payload) + return self._create(trans, create_payload, **kwd) + @expose_api_anonymous def create(self, trans, payload, **kwd): """ POST /api/tools Executes tool using specified inputs and returns tool's outputs. """ + tool_id = payload.get("tool_id") + if tool_id in PROTECTED_TOOLS: + raise exceptions.RequestParameterInvalidException("Cannot execute tool [%s] directly, must use alternative endpoint." % tool_id) + if tool_id is None: + raise exceptions.RequestParameterInvalidException("Must specify a valid tool_id to use this endpoint.") + return self._create(trans, payload, **kwd) + + def _create(self, trans, payload, **kwd): # HACK: for now, if action is rerun, rerun tool. action = payload.get('action', None) if action == 'rerun': diff --git a/lib/galaxy/webapps/galaxy/buildapp.py b/lib/galaxy/webapps/galaxy/buildapp.py index f5ec41c4adfe..eda00a26ee40 100644 --- a/lib/galaxy/webapps/galaxy/buildapp.py +++ b/lib/galaxy/webapps/galaxy/buildapp.py @@ -271,6 +271,7 @@ def populate_api_routes(webapp, app): # ====== TOOLS API ====== # ======================= + webapp.mapper.connect('/api/tools/fetch', action='fetch', controller='tools', conditions=dict(method=["POST"])) webapp.mapper.connect('/api/tools/all_requirements', action='all_requirements', controller="tools") webapp.mapper.connect('/api/tools/{id:.+?}/build', action='build', controller="tools") webapp.mapper.connect('/api/tools/{id:.+?}/reload', action='reload', controller="tools") diff --git a/lib/galaxy/workflow/modules.py b/lib/galaxy/workflow/modules.py index a7b4303badc3..d090ef512801 100644 --- a/lib/galaxy/workflow/modules.py +++ b/lib/galaxy/workflow/modules.py @@ -825,6 +825,9 @@ def execute(self, trans, progress, invocation_step, use_cached_job=False): invocation = invocation_step.workflow_invocation step = invocation_step.workflow_step tool = trans.app.toolbox.get_tool(step.tool_id, tool_version=step.tool_version) + if not tool.is_workflow_compatible: + message = "Specified tool [%s] in workflow is not workflow-compatible." % tool.id + raise Exception(message) tool_state = step.state # Not strictly needed - but keep Tool state clean by stripping runtime # metadata parameters from it. 
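The new route above wires POST /api/tools/fetch to the fetch() adapter, which validates and normalizes the request and then packs it into the __DATA_FETCH__ tool's request_version/request_json inputs. A minimal client-side sketch of calling the endpoint, assuming a hypothetical Galaxy URL, API key, and encoded history id; the payload shape follows the API tests and the scripts/api/fetch_to_library.py example added later in this diff:

import json

import requests

GALAXY_URL = "https://galaxy.example.org"  # hypothetical server
API_KEY = "REPLACE_ME"                     # hypothetical API key

# One target: fetch a remote URL into a history as a new dataset ("hdas" destination).
targets = [{
    "destination": {"type": "hdas"},
    "items": [{
        "src": "url",
        "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed",
        "ext": "bed",
    }],
}]
payload = {
    "key": API_KEY,
    "history_id": "REPLACE_WITH_ENCODED_HISTORY_ID",
    "targets": json.dumps(targets),
}
response = requests.post(GALAXY_URL + "/api/tools/fetch", data=payload)
response.raise_for_status()
print(response.json()["outputs"])

Unlike upload1, content fetched this way is left unaltered unless to_posix_lines or space_to_tab is explicitly enabled on the element (see the test_tools_upload.py changes below).
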
diff --git a/lib/tool_shed/galaxy_install/tool_dependencies/recipe/step_handler.py b/lib/tool_shed/galaxy_install/tool_dependencies/recipe/step_handler.py index f8dc0ee5cd76..917b2ead48ce 100644 --- a/lib/tool_shed/galaxy_install/tool_dependencies/recipe/step_handler.py +++ b/lib/tool_shed/galaxy_install/tool_dependencies/recipe/step_handler.py @@ -16,6 +16,7 @@ asbool, download_to_file ) +from galaxy.util.compression_utils import CompressedFile from galaxy.util.template import fill_template from tool_shed.galaxy_install.tool_dependencies.env_manager import EnvManager from tool_shed.util import basic_util, tool_dependency_util @@ -25,124 +26,6 @@ VIRTUALENV_URL = 'https://pypi.python.org/packages/d4/0c/9840c08189e030873387a73b90ada981885010dd9aea134d6de30cd24cb8/virtualenv-15.1.0.tar.gz' -class CompressedFile(object): - - def __init__(self, file_path, mode='r'): - if tarfile.is_tarfile(file_path): - self.file_type = 'tar' - elif zipfile.is_zipfile(file_path) and not file_path.endswith('.jar'): - self.file_type = 'zip' - self.file_name = os.path.splitext(os.path.basename(file_path))[0] - if self.file_name.endswith('.tar'): - self.file_name = os.path.splitext(self.file_name)[0] - self.type = self.file_type - method = 'open_%s' % self.file_type - if hasattr(self, method): - self.archive = getattr(self, method)(file_path, mode) - else: - raise NameError('File type %s specified, no open method found.' % self.file_type) - - def extract(self, path): - '''Determine the path to which the archive should be extracted.''' - contents = self.getmembers() - extraction_path = path - common_prefix = '' - if len(contents) == 1: - # The archive contains a single file, return the extraction path. - if self.isfile(contents[0]): - extraction_path = os.path.join(path, self.file_name) - if not os.path.exists(extraction_path): - os.makedirs(extraction_path) - self.archive.extractall(extraction_path) - else: - # Get the common prefix for all the files in the archive. If the common prefix ends with a slash, - # or self.isdir() returns True, the archive contains a single directory with the desired contents. - # Otherwise, it contains multiple files and/or directories at the root of the archive. - common_prefix = os.path.commonprefix([self.getname(item) for item in contents]) - if len(common_prefix) >= 1 and not common_prefix.endswith(os.sep) and self.isdir(self.getmember(common_prefix)): - common_prefix += os.sep - if not common_prefix.endswith(os.sep): - common_prefix = '' - extraction_path = os.path.join(path, self.file_name) - if not os.path.exists(extraction_path): - os.makedirs(extraction_path) - self.archive.extractall(extraction_path) - # Since .zip files store unix permissions separately, we need to iterate through the zip file - # and set permissions on extracted members. - if self.file_type == 'zip': - for zipped_file in contents: - filename = self.getname(zipped_file) - absolute_filepath = os.path.join(extraction_path, filename) - external_attributes = self.archive.getinfo(filename).external_attr - # The 2 least significant bytes are irrelevant, the next two contain unix permissions. 
- unix_permissions = external_attributes >> 16 - if unix_permissions != 0: - if os.path.exists(absolute_filepath): - os.chmod(absolute_filepath, unix_permissions) - else: - log.warning("Unable to change permission on extracted file '%s' as it does not exist" % absolute_filepath) - return os.path.abspath(os.path.join(extraction_path, common_prefix)) - - def getmembers_tar(self): - return self.archive.getmembers() - - def getmembers_zip(self): - return self.archive.infolist() - - def getname_tar(self, item): - return item.name - - def getname_zip(self, item): - return item.filename - - def getmember(self, name): - for member in self.getmembers(): - if self.getname(member) == name: - return member - - def getmembers(self): - return getattr(self, 'getmembers_%s' % self.type)() - - def getname(self, member): - return getattr(self, 'getname_%s' % self.type)(member) - - def isdir(self, member): - return getattr(self, 'isdir_%s' % self.type)(member) - - def isdir_tar(self, member): - return member.isdir() - - def isdir_zip(self, member): - if member.filename.endswith(os.sep): - return True - return False - - def isfile(self, member): - if not self.isdir(member): - return True - return False - - def open_tar(self, filepath, mode): - return tarfile.open(filepath, mode, errorlevel=0) - - def open_zip(self, filepath, mode): - return zipfile.ZipFile(filepath, mode) - - def zipfile_ok(self, path_to_archive): - """ - This function is a bit pedantic and not functionally necessary. It checks whether there is - no file pointing outside of the extraction, because ZipFile.extractall() has some potential - security holes. See python zipfile documentation for more details. - """ - basename = os.path.realpath(os.path.dirname(path_to_archive)) - zip_archive = zipfile.ZipFile(path_to_archive) - for member in zip_archive.namelist(): - member_path = os.path.realpath(os.path.join(basename, member)) - if not member_path.startswith(basename): - return False - return True - - class Download(object): def url_download(self, install_dir, downloaded_file_name, download_url, extract=True, checksums={}): diff --git a/scripts/api/fetch_to_library.py b/scripts/api/fetch_to_library.py new file mode 100644 index 000000000000..6c497bcb402b --- /dev/null +++ b/scripts/api/fetch_to_library.py @@ -0,0 +1,33 @@ +import argparse +import json + +import requests +import yaml + + +def main(): + parser = argparse.ArgumentParser(description='Upload a directory into a data library') + parser.add_argument("-u", "--url", dest="url", required=True, help="Galaxy URL") + parser.add_argument("-a", "--api", dest="api_key", required=True, help="API Key") + parser.add_argument('target', metavar='FILE', type=str, + help='file describing data library to fetch') + args = parser.parse_args() + with open(args.target, "r") as f: + target = yaml.load(f) + + histories_url = args.url + "/api/histories" + new_history_response = requests.post(histories_url, data={'key': args.api_key}) + + fetch_url = args.url + '/api/tools/fetch' + payload = { + 'key': args.api_key, + 'targets': json.dumps([target]), + 'history_id': new_history_response.json()["id"] + } + + response = requests.post(fetch_url, data=payload) + print(response.content) + + +if __name__ == '__main__': + main() diff --git a/scripts/api/fetch_to_library_example.yml b/scripts/api/fetch_to_library_example.yml new file mode 100644 index 000000000000..44bc35ef43b5 --- /dev/null +++ b/scripts/api/fetch_to_library_example.yml @@ -0,0 +1,42 @@ +destination: + type: library + name: Training Material + 
description: Data for selected tutorials from https://training.galaxyproject.org. +items: + - name: Quality Control + description: | + Data for sequence quality control tutorial at http://galaxyproject.github.io/training-material/topics/sequence-analysis/tutorials/quality-control/tutorial.html. + + 10.5281/zenodo.61771 + items: + - src: url + url: https://zenodo.org/record/61771/files/GSM461178_untreat_paired_subset_1.fastq + name: GSM461178_untreat_paired_subset_1 + ext: fastqsanger + info: Untreated subseq of GSM461178 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/61771/files/GSM461182_untreat_single_subset.fastq + name: GSM461182_untreat_single_subset + ext: fastqsanger + info: Untreated subseq of GSM461182 from 10.1186/s12864-017-3692-8 + - name: Small RNA-Seq + description: | + Data for small RNA-seq tutorial available at http://galaxyproject.github.io/training-material/topics/transcriptomics/tutorials/srna/tutorial.html + + 10.5281/zenodo.826906 + items: + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep1_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep1 + ext: fastqsanger.gz + info: Downsample rep1 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep2_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep2 + ext: fastqsanger.gz + info: Downsample rep2 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep3_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep3 + ext: fastqsanger.gz + info: Downsample rep3 from 10.1186/s12864-017-3692-8 diff --git a/test-data/1.csv b/test-data/1.csv new file mode 100644 index 000000000000..80d519fbe2d9 --- /dev/null +++ b/test-data/1.csv @@ -0,0 +1 @@ +Transaction_date,Product,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude 1/2/09 6:17,Product1,1200,Mastercard,carolina,Basildon,England,United Kingdom,1/2/09 6:00,1/2/09 6:08,51.5,-1.1166667 1/2/09 4:53,Product1,1200,Visa,Betina,Parkville ,MO,United States,1/2/09 4:42,1/2/09 7:49,39.195,-94.68194 1/2/09 13:08,Product1,1200,Mastercard,Federica e Andrea,Astoria ,OR,United States,1/1/09 16:21,1/3/09 12:32,46.18806,-123.83 1/3/09 14:44,Product1,1200,Visa,Gouya,Echuca,Victoria,Australia,9/25/05 21:13,1/3/09 14:22,-36.1333333,144.75 1/4/09 12:56,Product2,3600,Visa,Gerd W ,Cahaba Heights ,AL,United States,11/15/08 15:47,1/4/09 12:45,33.52056,-86.8025 1/4/09 13:19,Product1,1200,Visa,LAURENCE,Mickleton ,NJ,United States,9/24/08 15:19,1/4/09 13:04,39.79,-75.23806 1/4/09 20:11,Product1,1200,Mastercard,Fleur,Peoria ,IL,United States,1/3/09 9:38,1/4/09 19:45,40.69361,-89.58889 1/2/09 20:09,Product1,1200,Mastercard,adam,Martin ,TN,United States,1/2/09 17:43,1/4/09 20:01,36.34333,-88.85028 1/4/09 13:17,Product1,1200,Mastercard,Renee Elisabeth,Tel Aviv,Tel Aviv,Israel,1/4/09 13:03,1/4/09 22:10,32.0666667,34.7666667 1/4/09 14:11,Product1,1200,Visa,Aidan,Chatou,Ile-de-France,France,6/3/08 4:22,1/5/09 1:17,48.8833333,2.15 1/5/09 2:42,Product1,1200,Diners,Stacy,New York ,NY,United States,1/5/09 2:23,1/5/09 4:59,40.71417,-74.00639 1/5/09 5:39,Product1,1200,Amex,Heidi,Eindhoven,Noord-Brabant,Netherlands,1/5/09 4:55,1/5/09 8:15,51.45,5.4666667 1/2/09 9:16,Product1,1200,Mastercard,Sean ,Shavano Park ,TX,United States,1/2/09 8:32,1/5/09 9:05,29.42389,-98.49333 1/5/09 10:08,Product1,1200,Visa,Georgia,Eagle ,ID,United States,11/11/08 15:53,1/5/09 10:05,43.69556,-116.35306 1/2/09 
14:18,Product1,1200,Visa,Richard,Riverside ,NJ,United States,12/9/08 12:07,1/5/09 11:01,40.03222,-74.95778 1/25/09 17:58,Product2,3600,Visa,carol,Ann Arbor ,MI,United States,7/5/08 9:20,2/7/09 18:51,42.27083,-83.72639 1/9/09 14:37,Product1,1200,Visa,Nona,South Jordan ,UT,United States,1/8/09 15:14,2/7/09 19:11,40.56222,-111.92889 1/25/09 2:46,Product2,3600,Visa,Family,Dubai,Dubayy,United Arab Emirates,1/8/09 1:19,2/8/09 2:06,25.2522222,55.28 1/17/09 20:46,Product2,3600,Visa,Michelle,Dubai,Dubayy,United Arab Emirates,4/13/08 2:36,2/8/09 2:12,25.2522222,55.28 1/24/09 7:18,Product2,3600,Visa,Kathryn,Kirriemuir,Scotland,United Kingdom,1/23/09 10:31,2/8/09 2:52,56.6666667,-3 1/11/09 7:09,Product1,1200,Visa,Oswald,Tramore,Waterford,Ireland,10/13/08 16:43,2/8/09 3:02,52.1588889,-7.1463889 1/8/09 4:15,Product1,1200,Visa,Elyssa,Gdansk,Pomorskie,Poland,1/7/09 15:00,2/8/09 3:50,54.35,18.6666667 1/22/09 10:47,Product1,1200,Visa,michelle,Arklow,Wicklow,Ireland,11/18/08 1:32,2/8/09 5:07,52.7930556,-6.1413889 1/26/09 20:47,Product1,1200,Mastercard,Alicia,Lincoln ,NE,United States,6/24/08 8:05,2/8/09 7:29,40.8,-96.66667 1/12/09 12:22,Product1,1200,Mastercard,JP,Tierp,Uppsala,Sweden,1/6/09 11:34,2/8/09 11:15,60.3333333,17.5 1/26/09 1:44,Product2,3600,Visa,Geraldine,Brussels,Brussels (Bruxelles),Belgium,1/31/08 13:28,2/8/09 14:39,50.8333333,4.3333333 1/18/09 12:57,Product1,1200,Mastercard,sandra,Burr Oak ,IA,United States,1/24/08 16:11,2/8/09 15:30,43.45889,-91.86528 1/24/09 21:26,Product1,1200,Visa,Olivia,Wheaton ,IL,United States,5/8/08 16:02,2/8/09 16:00,41.86611,-88.10694 1/26/09 12:26,Product2,3600,Mastercard,Tom,Killeen ,TX,United States,1/26/09 5:23,2/8/09 17:33,31.11694,-97.7275 1/5/09 7:37,Product1,1200,Visa,Annette ,Manhattan ,NY,United States,9/26/08 4:29,2/8/09 18:42,40.71417,-74.00639 1/14/09 12:33,Product1,1200,Visa,SUSAN,Oxford,England,United Kingdom,9/11/08 23:23,2/8/09 23:00,51.75,-1.25 1/14/09 0:15,Product2,3600,Visa,Michael,Paris,Ile-de-France,France,11/28/08 0:07,2/9/09 1:30,48.8666667,2.3333333 1/1/09 12:42,Product1,1200,Visa,ashton,Exeter,England,United Kingdom,12/15/08 1:16,2/9/09 2:52,50.7,-3.5333333 1/6/09 6:07,Product1,1200,Visa,Scott,Rungsted,Frederiksborg,Denmark,12/27/08 14:29,2/9/09 4:20,55.8841667,12.5419444 1/15/09 5:11,Product2,3600,Visa,Pam,London,England,United Kingdom,7/11/06 12:43,2/9/09 4:42,51.52721,0.14559 1/17/09 4:03,Product1,1200,Visa,Lisa ,Borja,Bohol,Philippines,1/17/09 2:45,2/9/09 6:09,9.9136111,124.0927778 1/19/09 10:13,Product2,3600,Mastercard,Pavel,London,England,United Kingdom,2/28/06 5:35,2/9/09 6:57,51.51334,-0.08895 1/18/09 9:42,Product1,1200,Visa,Richard,Jamestown ,RI,United States,1/18/09 9:22,2/9/09 8:30,41.49694,-71.36778 1/9/09 11:14,Product1,1200,Visa,Jasinta Jeanne,Owings Mills ,MD,United States,1/9/09 10:43,2/9/09 9:17,39.41944,-76.78056 1/10/09 13:42,Product1,1200,Visa,Rachel,Hamilton,Ontario,Canada,1/10/09 12:22,2/9/09 9:54,43.25,-79.8333333 1/7/09 7:28,Product1,1200,Amex,Cherish ,Anchorage ,AK,United States,7/28/08 7:31,2/9/09 10:50,61.21806,-149.90028 1/18/09 6:46,Product1,1200,Visa,Shona ,Mornington,Meath,Ireland,1/15/09 9:13,2/9/09 11:55,53.7233333,-6.2825 1/30/09 12:18,Product1,1200,Mastercard,Abikay,Fullerton ,CA,United States,1/26/09 13:34,2/9/09 12:53,33.87028,-117.92444 1/6/09 5:42,Product1,1200,Amex,Abikay,Atlanta ,GA,United States,10/27/08 14:16,2/9/09 13:50,33.74889,-84.38806 1/2/09 10:58,Product2,3600,Visa,Kendra,Toronto,Ontario,Canada,1/2/09 10:38,2/9/09 13:56,43.6666667,-79.4166667 1/8/09 
3:29,Product1,1200,Visa,amanda,Liverpool,England,United Kingdom,12/22/08 1:41,2/9/09 14:06,53.4166667,-3 1/12/09 13:23,Product2,3600,Amex,Leila,Ponte San Nicolo,Veneto,Italy,9/13/05 8:42,2/9/09 14:09,45.3666667,11.6166667 1/19/09 9:34,Product1,1200,Amex,amanda,Las Vegas ,NV,United States,5/10/08 8:56,2/9/09 16:44,36.175,-115.13639 1/9/09 7:49,Product1,1200,Visa,Stacy,Rochester Hills ,MI,United States,7/28/08 7:18,2/9/09 17:41,42.68056,-83.13389 1/15/09 5:27,Product2,3600,Visa,Derrick,North Bay,Ontario,Canada,1/6/09 17:42,2/9/09 18:22,46.3,-79.45 1/8/09 23:40,Product1,1200,Visa,Jacob,Lindfield,New South Wales,Australia,1/8/09 17:52,2/9/09 18:31,-33.7833333,151.1666667 1/27/09 11:02,Product1,1200,Mastercard,DOREEN,Madrid,Madrid,Spain,1/24/09 8:21,2/9/09 18:42,40.4,-3.6833333 1/14/09 13:23,Product1,1200,Diners,eugenia,Wisconsin Rapids ,WI,United States,11/15/08 13:57,2/9/09 18:44,44.38361,-89.81722 1/7/09 20:01,Product1,1200,Visa,Karen,Austin ,TX,United States,1/6/09 19:16,2/9/09 19:56,30.26694,-97.74278 1/20/09 12:32,Product1,1200,Visa,Bea,Chicago ,IL,United States,1/16/09 19:08,2/9/09 20:42,41.85,-87.65 1/6/09 14:35,Product1,1200,Diners,Hilde Karin,Las Vegas ,NV,United States,12/17/08 11:59,2/9/09 22:59,36.175,-115.13639 1/4/09 6:51,Product1,1200,Visa,Rima,Mullingar,Westmeath,Ireland,1/3/09 12:34,2/10/09 0:59,53.5333333,-7.35 1/24/09 18:30,Product1,1200,Visa,Ruangrote,Melbourne,Victoria,Australia,7/17/08 5:19,2/10/09 2:12,-37.8166667,144.9666667 1/25/09 5:57,Product1,1200,Amex,pamela,Ayacucho,Buenos Aires,Argentina,1/24/09 9:29,2/10/09 6:38,-37.15,-58.4833333 1/5/09 10:02,Product2,3600,Visa,Emillie,Eagan ,MN,United States,1/5/09 9:03,2/10/09 7:29,44.80417,-93.16667 1/13/09 9:14,Product1,1200,Visa,sangeeta,Vossevangen,Hordaland,Norway,1/9/09 9:31,2/10/09 9:04,60.6333333,6.4333333 1/22/09 7:35,Product1,1200,Visa,Anja,Ferney-Voltaire,Rhone-Alpes,France,1/22/09 6:51,2/10/09 9:18,46.25,6.1166667 1/2/09 11:06,Product1,1200,Mastercard,Andrew,Sevilla,Andalucia,Spain,3/12/06 15:02,2/10/09 10:04,37.3772222,-5.9869444 1/11/09 9:50,Product1,1200,Visa,Bato,Munchengosserstadt,Thuringia,Germany,1/7/09 11:45,2/10/09 10:28,51.05,11.65 1/21/09 20:44,Product1,1200,Mastercard,Ailsa ,Lindenhurst ,NY,United States,1/21/09 7:47,2/10/09 10:51,40.68667,-73.37389 1/5/09 9:09,Product1,1200,Visa,Sophie,Bloomfield ,MI,United States,10/23/06 6:52,2/10/09 10:58,42.53778,-83.23306 1/5/09 12:41,Product1,1200,Visa,Katrin,Calgary,Alberta,Canada,12/3/08 14:49,2/10/09 11:45,51.0833333,-114.0833333 1/28/09 12:54,Product2,3600,Mastercard,Kelly ,Vancouver,British Columbia,Canada,1/27/09 21:04,2/10/09 12:09,49.25,-123.1333333 1/21/09 4:46,Product1,1200,Visa,Tomasz,Klampenborg,Kobenhavn,Denmark,6/10/08 11:25,2/10/09 12:22,55.7666667,12.6 1/7/09 13:28,Product1,1200,Visa,Elizabeth,Calne,England,United Kingdom,1/4/09 13:07,2/10/09 12:39,51.4333333,-2 1/27/09 11:18,Product2,3600,Amex,Michael,Los Angeles ,CA,United States,1/23/09 11:47,2/10/09 13:09,34.05222,-118.24278 1/7/09 12:39,Product2,3600,Visa,Natasha,Milano,Lombardy,Italy,6/2/06 13:01,2/10/09 13:19,45.4666667,9.2 1/24/09 13:54,Product2,3600,Mastercard,Meredith,Kloten,Zurich,Switzerland,1/24/09 12:30,2/10/09 13:47,47.45,8.5833333 1/30/09 6:48,Product1,1200,Mastercard,Nicole,Fayetteville ,NC,United States,1/30/09 4:51,2/10/09 14:41,35.0525,-78.87861 1/22/09 18:07,Product1,1200,Visa,Ryan,Simpsonville ,SC,United States,1/6/09 16:59,2/10/09 15:30,34.73694,-82.25444 1/29/09 15:03,Product1,1200,Visa,Mary ,Auckland,Auckland,New Zealand,2/9/06 11:14,2/10/09 16:31,-36.8666667,174.7666667 
1/2/09 14:14,Product1,1200,Diners,Aaron,Reading,England,United Kingdom,11/16/08 15:49,2/10/09 16:38,51.4333333,-1 1/19/09 11:05,Product1,1200,Visa,Bertrand,North Caldwell ,NJ,United States,10/3/08 5:55,2/10/09 18:16,40.83972,-74.27694 \ No newline at end of file diff --git a/test-data/example-bag.zip b/test-data/example-bag.zip new file mode 100644 index 000000000000..ee4de52c265b Binary files /dev/null and b/test-data/example-bag.zip differ diff --git a/test-data/testdir1.zip b/test-data/testdir1.zip new file mode 100644 index 000000000000..0f71e2ce96aa Binary files /dev/null and b/test-data/testdir1.zip differ diff --git a/test/api/test_dataset_collections.py b/test/api/test_dataset_collections.py index ca8950d8bb0c..87752d2b9096 100644 --- a/test/api/test_dataset_collections.py +++ b/test/api/test_dataset_collections.py @@ -188,6 +188,78 @@ def test_enforces_unique_names(self): create_response = self._post("dataset_collections", payload) self._assert_status_code_is(create_response, 400) + def test_upload_collection(self): + elements = [{"src": "files", "dbkey": "hg19", "info": "my cool bed"}] + targets = [{ + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + "name": "Test upload", + }] + payload = { + "history_id": self.history_id, + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + hdca = self._assert_one_collection_created_in_history() + self.assertEquals(hdca["name"], "Test upload") + assert len(hdca["elements"]) == 1, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "4.bed" + assert element0["object"]["file_size"] == 61 + + def test_upload_nested(self): + elements = [{"name": "samp1", "elements": [{"src": "files", "dbkey": "hg19", "info": "my cool bed"}]}] + targets = [{ + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list:list", + "name": "Test upload", + }] + payload = { + "history_id": self.history_id, + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + hdca = self._assert_one_collection_created_in_history() + self.assertEquals(hdca["name"], "Test upload") + assert len(hdca["elements"]) == 1, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "samp1" + + def test_upload_collection_from_url(self): + elements = [{"src": "url", "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed", "info": "my cool bed"}] + targets = [{ + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + }] + payload = { + "history_id": self.history_id, + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + hdca = self._assert_one_collection_created_in_history() + assert len(hdca["elements"]) == 1, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "4.bed" + assert element0["object"]["file_size"] == 61 + + def _assert_one_collection_created_in_history(self): + contents_response = self._get("histories/%s/contents/dataset_collections" % self.history_id) + self._assert_status_code_is(contents_response, 200) + contents = contents_response.json() + assert len(contents) == 1 + hdca = contents[0] + assert hdca["history_content_type"] == 
"dataset_collection" + hdca_id = hdca["id"] + collection_response = self._get("histories/%s/contents/dataset_collections/%s" % (self.history_id, hdca_id)) + self._assert_status_code_is(collection_response, 200) + return collection_response.json() + def _check_create_response(self, create_response): self._assert_status_code_is(create_response, 200) dataset_collection = create_response.json() diff --git a/test/api/test_libraries.py b/test/api/test_libraries.py index 2a715f50fcbb..ae0303a8074f 100644 --- a/test/api/test_libraries.py +++ b/test/api/test_libraries.py @@ -1,3 +1,5 @@ +import json + from base import api from base.populators import ( DatasetCollectionPopulator, @@ -95,6 +97,94 @@ def test_create_dataset(self): assert library_dataset["peek"].find("create_test") >= 0 assert library_dataset["file_ext"] == "txt", library_dataset["file_ext"] + def test_fetch_upload_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("flat_zip") + items = [{"src": "files", "dbkey": "hg19", "info": "my cool bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + assert dataset["genome_build"] == "hg19", dataset + assert dataset["misc_info"] == "my cool bed", dataset + assert dataset["file_ext"] == "bed", dataset + + def test_fetch_zip_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("flat_zip") + bed_test_data_path = self.test_data_resolver.get_filename("4.bed.zip") + targets = [{ + "destination": destination, + "items_from": "archive", "src": "files", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(bed_test_data_path)} + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_single_url_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("single_url") + items = [{"src": "url", "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_url_archive_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("single_url") + targets = [{ + "destination": destination, + "items_from": "archive", + "src": "url", + "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed.zip", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_bagit_archive_to_folder(self): + history_id, library, destination = 
self._setup_fetch_to_folder("bagit_archive") + example_bag_path = self.test_data_resolver.get_filename("example-bag.zip") + targets = [{ + "destination": destination, + "items_from": "bagit_archive", "src": "files", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(example_bag_path)}, + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/README.txt") + assert dataset["file_size"] == 66, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/bdbag-profile.json") + assert dataset["file_size"] == 723, dataset + + def _setup_fetch_to_folder(self, test_name): + return self.library_populator.setup_fetch_to_folder(test_name) + def test_create_dataset_in_folder(self): library = self.library_populator.new_private_library("ForCreateDatasets") folder_response = self._create_folder(library) diff --git a/test/api/test_tools_upload.py b/test/api/test_tools_upload.py index a05fe9dce49f..cb67b84507dd 100644 --- a/test/api/test_tools_upload.py +++ b/test/api/test_tools_upload.py @@ -1,3 +1,5 @@ +import json + from base import api from base.constants import ( ONE_TO_SIX_ON_WINDOWS, @@ -33,19 +35,28 @@ def test_upload1_paste_bad_datatype(self): self._assert_has_keys(create, 'err_msg') assert file_type in create['err_msg'] - def test_upload_posix_newline_fixes(self): + # upload1 rewrites content with posix lines by default but this can be disabled by setting + # to_posix_lines=None in the request. Newer fetch API does not do this by default prefering + # to keep content unaltered if possible but it can be enabled with a simple JSON boolean switch + # of the same name (to_posix_lines). 
+ def test_upload_posix_newline_fixes_by_default(self): windows_content = ONE_TO_SIX_ON_WINDOWS result_content = self._upload_and_get_content(windows_content) self.assertEquals(result_content, ONE_TO_SIX_WITH_TABS) + def test_fetch_posix_unaltered(self): + windows_content = ONE_TO_SIX_ON_WINDOWS + result_content = self._upload_and_get_content(windows_content, api="fetch") + self.assertEquals(result_content, ONE_TO_SIX_ON_WINDOWS) + def test_upload_disable_posix_fix(self): windows_content = ONE_TO_SIX_ON_WINDOWS result_content = self._upload_and_get_content(windows_content, to_posix_lines=None) self.assertEquals(result_content, windows_content) - def test_upload_tab_to_space(self): - table = ONE_TO_SIX_WITH_SPACES - result_content = self._upload_and_get_content(table, space_to_tab="Yes") + def test_fetch_post_lines_option(self): + windows_content = ONE_TO_SIX_ON_WINDOWS + result_content = self._upload_and_get_content(windows_content, api="fetch", to_posix_lines=True) self.assertEquals(result_content, ONE_TO_SIX_WITH_TABS) def test_upload_tab_to_space_off_by_default(self): @@ -53,6 +64,21 @@ def test_upload_tab_to_space_off_by_default(self): result_content = self._upload_and_get_content(table) self.assertEquals(result_content, table) + def test_fetch_tab_to_space_off_by_default(self): + table = ONE_TO_SIX_WITH_SPACES + result_content = self._upload_and_get_content(table, api='fetch') + self.assertEquals(result_content, table) + + def test_upload_tab_to_space(self): + table = ONE_TO_SIX_WITH_SPACES + result_content = self._upload_and_get_content(table, space_to_tab="Yes") + self.assertEquals(result_content, ONE_TO_SIX_WITH_TABS) + + def test_fetch_tab_to_space(self): + table = ONE_TO_SIX_WITH_SPACES + result_content = self._upload_and_get_content(table, api="fetch", space_to_tab=True) + self.assertEquals(result_content, ONE_TO_SIX_WITH_TABS) + @skip_without_datatype("rdata") def test_rdata_not_decompressed(self): # Prevent regression of https://github.com/galaxyproject/galaxy/issues/753 @@ -60,6 +86,30 @@ def test_rdata_not_decompressed(self): rdata_metadata = self._upload_and_get_details(open(rdata_path, "rb"), file_type="auto") self.assertEquals(rdata_metadata["file_ext"], "rdata") + @skip_without_datatype("csv") + def test_csv_upload(self): + csv_path = TestDataResolver().get_filename("1.csv") + csv_metadata = self._upload_and_get_details(open(csv_path, "rb"), file_type="csv") + self.assertEquals(csv_metadata["file_ext"], "csv") + + @skip_without_datatype("csv") + def test_csv_upload_auto(self): + csv_path = TestDataResolver().get_filename("1.csv") + csv_metadata = self._upload_and_get_details(open(csv_path, "rb"), file_type="auto") + self.assertEquals(csv_metadata["file_ext"], "csv") + + @skip_without_datatype("csv") + def test_csv_fetch(self): + csv_path = TestDataResolver().get_filename("1.csv") + csv_metadata = self._upload_and_get_details(open(csv_path, "rb"), api="fetch", ext="csv", to_posix_lines=True) + self.assertEquals(csv_metadata["file_ext"], "csv") + + @skip_without_datatype("csv") + def test_csv_sniff_fetch(self): + csv_path = TestDataResolver().get_filename("1.csv") + csv_metadata = self._upload_and_get_details(open(csv_path, "rb"), api="fetch", ext="auto", to_posix_lines=True) + self.assertEquals(csv_metadata["file_ext"], "csv") + @skip_without_datatype("velvet") def test_composite_datatype(self): with self.dataset_populator.test_history() as history_id: @@ -113,6 +163,11 @@ def test_upload_dbkey(self): datasets = run_response.json()["outputs"] assert 
datasets[0].get("genome_build") == "hg19", datasets[0] + def test_fetch_dbkey(self): + table = ONE_TO_SIX_WITH_SPACES + details = self._upload_and_get_details(table, api='fetch', dbkey="hg19") + assert details.get("genome_build") == "hg19" + def test_upload_multiple_files_1(self): with self.dataset_populator.test_history() as history_id: payload = self.dataset_populator.upload_payload(history_id, "Test123", @@ -329,8 +384,23 @@ def _upload_and_get_details(self, content, **upload_kwds): history_id, new_dataset = self._upload(content, **upload_kwds) return self.dataset_populator.get_history_dataset_details(history_id, dataset=new_dataset) - def _upload(self, content, **upload_kwds): + def _upload(self, content, api="upload1", **upload_kwds): history_id = self.dataset_populator.new_history() - new_dataset = self.dataset_populator.new_dataset(history_id, content=content, **upload_kwds) + if api == "upload1": + new_dataset = self.dataset_populator.new_dataset(history_id, content=content, **upload_kwds) + else: + assert api == "fetch" + element = dict(src="files", **upload_kwds) + target = { + "destination": {"type": "hdas"}, + "elements": [element], + } + targets = json.dumps([target]) + payload = { + "history_id": history_id, + "targets": targets, + "__files": {"files_0|file_data": content} + } + new_dataset = self.dataset_populator.fetch(payload).json()["outputs"][0] self.dataset_populator.wait_for_history(history_id, assert_ok=upload_kwds.get("assert_ok", True)) return history_id, new_dataset diff --git a/test/base/driver_util.py b/test/base/driver_util.py index 638eb7f96250..7c31e7df7eee 100644 --- a/test/base/driver_util.py +++ b/test/base/driver_util.py @@ -198,6 +198,7 @@ def setup_galaxy_config( enable_beta_tool_formats=True, expose_dataset_path=True, file_path=file_path, + ftp_upload_purge=False, galaxy_data_manager_data_path=galaxy_data_manager_data_path, id_secret='changethisinproductiontoo', job_config_file=job_config_file, diff --git a/test/base/integration_util.py b/test/base/integration_util.py index 363b19a32bff..540d6bf1c108 100644 --- a/test/base/integration_util.py +++ b/test/base/integration_util.py @@ -8,6 +8,7 @@ from unittest import skip, TestCase from galaxy.tools.deps.commands import which +from galaxy.tools.verify.test_data import TestDataResolver from .api import UsesApiTestCaseMixin from .driver_util import GalaxyTestDriver @@ -56,6 +57,7 @@ def tearDownClass(cls): cls._app_available = False def setUp(self): + self.test_data_resolver = TestDataResolver() # Setup attributes needed for API testing... 
server_wrapper = self._test_driver.server_wrappers[0] host = server_wrapper.host diff --git a/test/base/populators.py b/test/base/populators.py index 9bff5a08e9fe..22d1d15d87d3 100644 --- a/test/base/populators.py +++ b/test/base/populators.py @@ -148,14 +148,30 @@ def new_dataset_request(self, history_id, content=None, wait=False, **kwds): self.wait_for_tool_run(history_id, run_response, assert_ok=kwds.get('assert_ok', True)) return run_response + def fetch(self, payload, assert_ok=True, timeout=DEFAULT_TIMEOUT): + tool_response = self._post("tools/fetch", data=payload) + if assert_ok: + job = self.check_run(tool_response) + self.wait_for_job(job["id"], timeout=timeout) + + job = tool_response.json()["jobs"][0] + details = self.get_job_details(job["id"]).json() + assert details["state"] == "ok", details + + return tool_response + def wait_for_tool_run(self, history_id, run_response, timeout=DEFAULT_TIMEOUT, assert_ok=True): - run = run_response.json() - assert run_response.status_code == 200, run - job = run["jobs"][0] + job = self.check_run(run_response) self.wait_for_job(job["id"], timeout=timeout) self.wait_for_history(history_id, assert_ok=assert_ok, timeout=timeout) return run_response + def check_run(self, run_response): + run = run_response.json() + assert run_response.status_code == 200, run + job = run["jobs"][0] + return job + def wait_for_history(self, history_id, assert_ok=False, timeout=DEFAULT_TIMEOUT): try: return wait_on_state(lambda: self._get("histories/%s" % history_id), assert_ok=assert_ok, timeout=timeout) @@ -266,8 +282,8 @@ def run_tool(self, tool_id, inputs, history_id, assert_ok=True, **kwds): else: return tool_response - def tools_post(self, payload): - tool_response = self._post("tools", data=payload) + def tools_post(self, payload, url="tools"): + tool_response = self._post(url, data=payload) return tool_response def get_history_dataset_content(self, history_id, wait=True, filename=None, **kwds): @@ -463,6 +479,11 @@ class LibraryPopulator(object): def __init__(self, galaxy_interactor): self.galaxy_interactor = galaxy_interactor + self.dataset_populator = DatasetPopulator(galaxy_interactor) + + def get_libraries(self): + get_response = self.galaxy_interactor.get("libraries") + return get_response.json() def new_private_library(self, name): library = self.new_library(name) @@ -514,6 +535,8 @@ def create_dataset_request(self, library, **kwds): "file_type": kwds.get("file_type", "auto"), "db_key": kwds.get("db_key", "?"), } + if kwds.get("link_data"): + create_data["link_data_only"] = "link_to_files" if upload_option == "upload_file": files = { @@ -532,7 +555,9 @@ def new_library_dataset(self, name, **create_dataset_kwds): library = self.new_private_library(name) payload, files = self.create_dataset_request(library, **create_dataset_kwds) dataset = self.raw_library_contents_create(library["id"], payload, files=files).json()[0] + return self.wait_on_library_dataset(library, dataset) + def wait_on_library_dataset(self, library, dataset): def show(): return self.galaxy_interactor.get("libraries/%s/contents/%s" % (library["id"], dataset["id"])) @@ -563,6 +588,24 @@ def show(): return library, library_dataset + def get_library_contents_with_path(self, library_id, path): + all_contents_response = self.galaxy_interactor.get("libraries/%s/contents" % library_id) + api_asserts.assert_status_code_is(all_contents_response, 200) + all_contents = all_contents_response.json() + matching = [c for c in all_contents if c["name"] == path] + if len(matching) == 0: + raise 
Exception("Failed to find library contents with path [%s], contents are %s" % (path, all_contents)) + get_response = self.galaxy_interactor.get(matching[0]["url"]) + api_asserts.assert_status_code_is(get_response, 200) + return get_response.json() + + def setup_fetch_to_folder(self, test_name): + history_id = self.dataset_populator.new_history() + library = self.new_private_library(test_name) + folder_id = library["root_folder_id"][1:] + destination = {"type": "library_folder", "library_folder_id": folder_id} + return history_id, library, destination + class BaseDatasetCollectionPopulator(object): diff --git a/test/functional/tools/sample_datatypes_conf.xml b/test/functional/tools/sample_datatypes_conf.xml index a37897791eed..29917074a99c 100644 --- a/test/functional/tools/sample_datatypes_conf.xml +++ b/test/functional/tools/sample_datatypes_conf.xml @@ -4,6 +4,7 @@ + @@ -14,6 +15,12 @@ + + + + + + diff --git a/test/integration/test_upload_configuration_options.py b/test/integration/test_upload_configuration_options.py index 441fd32f3d17..fa9548c230ae 100644 --- a/test/integration/test_upload_configuration_options.py +++ b/test/integration/test_upload_configuration_options.py @@ -19,6 +19,7 @@ framework but tested here for FTP uploads. """ +import json import os import re import shutil @@ -54,6 +55,59 @@ def setUp(self): self.library_populator = LibraryPopulator(self.galaxy_interactor) self.history_id = self.dataset_populator.new_history() + def fetch_target(self, target, assert_ok=False, attach_test_file=False): + payload = { + "history_id": self.history_id, + "targets": json.dumps([target]), + } + if attach_test_file: + payload["__files"] = {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))} + + response = self.dataset_populator.fetch(payload, assert_ok=assert_ok) + return response + + @classmethod + def temp_config_dir(cls, name): + # realpath here to get around problems with symlinks being blocked. 
+ return os.path.realpath(os.path.join(cls._test_driver.galaxy_test_tmp_dir, name)) + + def _write_file(self, dir_path, content, filename="test"): + """Helper for writing ftp/server dir files.""" + self._ensure_directory(dir_path) + path = os.path.join(dir_path, filename) + with open(path, "w") as f: + f.write(content) + return path + + def _ensure_directory(self, path): + if not os.path.exists(path): + os.makedirs(path) + + +class InvalidFetchRequestsTestCase(BaseUploadContentConfigurationTestCase): + + def test_in_place_not_allowed(self): + elements = [{"src": "files", "in_place": False}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target, attach_test_file=True) + self._assert_status_code_is(response, 400) + assert 'in_place' in response.json()["err_msg"] + + def test_files_not_attached(self): + elements = [{"src": "files"}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 400) + assert 'Failed to find uploaded file matching target' in response.json()["err_msg"] + class NonAdminsCannotPasteFilePathTestCase(BaseUploadContentConfigurationTestCase): @@ -72,6 +126,8 @@ def test_disallowed_for_primary_file(self): @skip_without_datatype("velvet") def test_disallowed_for_composite_file(self): + path = os.path.join(TEST_DATA_DIRECTORY, "1.txt") + assert os.path.exists(path) payload = self.dataset_populator.upload_payload( self.history_id, "sequences content", @@ -79,7 +135,7 @@ def test_disallowed_for_composite_file(self): extra_inputs={ "files_1|url_paste": "roadmaps content", "files_1|type": "upload_dataset", - "files_2|url_paste": "file://%s/1.txt" % TEST_DATA_DIRECTORY, + "files_2|url_paste": "file://%s" % path, "files_2|type": "upload_dataset", }, ) @@ -87,12 +143,42 @@ def test_disallowed_for_composite_file(self): # Ideally this would be 403 but the tool API endpoint isn't using # the newer API decorator that handles those details. 
assert create_response.status_code >= 400 + assert os.path.exists(path) def test_disallowed_for_libraries(self): + path = os.path.join(TEST_DATA_DIRECTORY, "1.txt") + assert os.path.exists(path) library = self.library_populator.new_private_library("pathpastedisallowedlibraries") - payload, files = self.library_populator.create_dataset_request(library, upload_option="upload_paths", paths="%s/1.txt" % TEST_DATA_DIRECTORY) + payload, files = self.library_populator.create_dataset_request(library, upload_option="upload_paths", paths=path) response = self.library_populator.raw_library_contents_create(library["id"], payload, files=files) assert response.status_code == 403, response.json() + assert os.path.exists(path) + + def test_disallowed_for_fetch(self): + path = os.path.join(TEST_DATA_DIRECTORY, "1.txt") + assert os.path.exists(path) + elements = [{"src": "path", "path": path}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 403) + assert os.path.exists(path) + + def test_disallowed_for_fetch_urls(self): + path = os.path.join(TEST_DATA_DIRECTORY, "1.txt") + assert os.path.exists(path) + elements = [{"src": "url", "url": "file://%s" % path}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 403) + assert os.path.exists(path) class AdminsCanPasteFilePathsTestCase(BaseUploadContentConfigurationTestCase): @@ -113,10 +199,38 @@ def test_admin_path_paste(self): def test_admin_path_paste_libraries(self): library = self.library_populator.new_private_library("pathpasteallowedlibraries") - payload, files = self.library_populator.create_dataset_request(library, upload_option="upload_paths", paths="%s/1.txt" % TEST_DATA_DIRECTORY) + path = "%s/1.txt" % TEST_DATA_DIRECTORY + assert os.path.exists(path) + payload, files = self.library_populator.create_dataset_request(library, upload_option="upload_paths", paths=path) response = self.library_populator.raw_library_contents_create(library["id"], payload, files=files) # Was 403 for non-admin above. assert response.status_code == 200 + # Test regression where this was getting deleted in this mode. + assert os.path.exists(path) + + def test_admin_fetch(self): + path = os.path.join(TEST_DATA_DIRECTORY, "1.txt") + elements = [{"src": "path", "path": path}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + assert os.path.exists(path) + + def test_admin_fetch_file_url(self): + path = os.path.join(TEST_DATA_DIRECTORY, "1.txt") + elements = [{"src": "url", "url": "file://%s" % path}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + assert os.path.exists(path) class DefaultBinaryContentFiltersTestCase(BaseUploadContentConfigurationTestCase): @@ -212,6 +326,16 @@ def test_blocked_url_for_composite_file(self): # the newer API decorator that handles those details. 
assert create_response.status_code >= 400 + def test_blocked_url_for_fetch(self): + elements = [{"src": "url", "url": "http://localhost"}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 403) + class BaseFtpUploadConfigurationTestCase(BaseUploadContentConfigurationTestCase): @@ -228,7 +352,7 @@ def handle_extra_ftp_config(cls, config): @classmethod def ftp_dir(cls): - return os.path.join(cls._test_driver.galaxy_test_tmp_dir, "ftp") + return cls.temp_config_dir("ftp") def _check_content(self, dataset, content, ext="txt"): dataset = self.dataset_populator.get_history_dataset_details(self.history_id, dataset=dataset) @@ -251,6 +375,22 @@ def _ensure_directory(self, path): if not os.path.exists(path): os.makedirs(path) + def _run_purgable_upload(self): + # Purge setting is actually used with a fairly specific set of parameters - see: + # https://github.com/galaxyproject/galaxy/issues/5361 + content = "hello world\n" + ftp_path = self._write_ftp_file(content) + ftp_files = self.dataset_populator.get_remote_files() + assert len(ftp_files) == 1 + assert ftp_files[0]["path"] == "test" + assert os.path.exists(ftp_path) + # gotta set to_posix_lines to None currently to force purging of non-binary data. + dataset = self.dataset_populator.new_dataset( + self.history_id, ftp_files="test", file_type="txt", to_posix_lines=None, wait=True + ) + self._check_content(dataset, content) + return ftp_path + class SimpleFtpUploadConfigurationTestCase(BaseFtpUploadConfigurationTestCase): @@ -267,7 +407,29 @@ def test_ftp_upload(self): self.history_id, ftp_files="test", file_type="txt", to_posix_lines=None, wait=True ) self._check_content(dataset, content) - assert not os.path.exists(ftp_path) + + def test_ftp_fetch(self): + content = "hello world\n" + ftp_path = self._write_ftp_file(content) + ftp_files = self.dataset_populator.get_remote_files() + assert len(ftp_files) == 1, ftp_files + assert ftp_files[0]["path"] == "test" + assert os.path.exists(ftp_path) + elements = [{"src": "ftp_import", "ftp_path": ftp_files[0]["path"]}] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list", + "name": "cool collection", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + response_object = response.json() + assert "output_collections" in response_object + output_collections = response_object["output_collections"] + assert len(output_collections) == 1, response_object + dataset = self.dataset_populator.get_history_dataset_details(self.history_id, hid=2) + self._check_content(dataset, content) class ExplicitEmailAsIdentifierFtpUploadConfigurationTestCase(SimpleFtpUploadConfigurationTestCase): @@ -305,21 +467,66 @@ def handle_extra_ftp_config(cls, config): config["ftp_upload_purge"] = "False" def test_ftp_uploads_not_purged(self): - content = "hello world\n" - ftp_path = self._write_ftp_file(content) - ftp_files = self.dataset_populator.get_remote_files() - assert len(ftp_files) == 1 - assert ftp_files[0]["path"] == "test" - assert os.path.exists(ftp_path) - # gotta set to_posix_lines to None currently to force purging of non-binary data. - dataset = self.dataset_populator.new_dataset( - self.history_id, ftp_files="test", file_type="txt", to_posix_lines=None, wait=True - ) - self._check_content(dataset, content) + ftp_path = self._run_purgable_upload() # Purge is disabled, this better still be here. 
assert os.path.exists(ftp_path) +class EnableFtpPurgeUploadConfigurationTestCase(BaseFtpUploadConfigurationTestCase): + + @classmethod + def handle_extra_ftp_config(cls, config): + config["ftp_upload_purge"] = "True" + + def test_ftp_uploads_not_purged(self): + ftp_path = self._run_purgable_upload() + assert not os.path.exists(ftp_path) + + +class AdvancedFtpUploadFetchTestCase(BaseFtpUploadConfigurationTestCase): + + def test_fetch_ftp_directory(self): + dir_path = self._get_user_ftp_path() + self._write_file(os.path.join(dir_path, "subdir"), "content 1", filename="1") + self._write_file(os.path.join(dir_path, "subdir"), "content 22", filename="2") + self._write_file(os.path.join(dir_path, "subdir"), "content 333", filename="3") + target = { + "destination": {"type": "hdca"}, + "elements_from": "directory", + "src": "ftp_import", + "ftp_path": "subdir", + "collection_type": "list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + hdca = self.dataset_populator.get_history_collection_details(self.history_id, hid=1) + assert len(hdca["elements"]) == 3, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "1" + assert element0["object"]["file_size"] == 9 + + def test_fetch_nested_elements_from(self): + dir_path = self._get_user_ftp_path() + self._write_file(os.path.join(dir_path, "subdir1"), "content 1", filename="1") + self._write_file(os.path.join(dir_path, "subdir1"), "content 22", filename="2") + self._write_file(os.path.join(dir_path, "subdir2"), "content 333", filename="3") + elements = [ + {"name": "subdirel1", "src": "ftp_import", "ftp_path": "subdir1", "elements_from": "directory", "collection_type": "list"}, + {"name": "subdirel2", "src": "ftp_import", "ftp_path": "subdir2", "elements_from": "directory", "collection_type": "list"}, + ] + target = { + "destination": {"type": "hdca"}, + "elements": elements, + "collection_type": "list:list", + } + response = self.fetch_target(target) + self._assert_status_code_is(response, 200) + hdca = self.dataset_populator.get_history_collection_details(self.history_id, hid=1) + assert len(hdca["elements"]) == 2, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "subdirel1" + + class UploadOptionsFtpUploadConfigurationTestCase(BaseFtpUploadConfigurationTestCase): def test_upload_api_option_space_to_tab(self): @@ -428,7 +635,7 @@ def _copy_to_user_ftp_file(self, test_data_path): shutil.copyfile(input_path, os.path.join(target_dir, test_data_path)) def _write_user_ftp_file(self, path, content): - return self._write_ftp_file(content, filename=path) + return self._write_file(os.path.join(self.ftp_dir(), TEST_USER), content, filename=path) class ServerDirectoryOffByDefaultTestCase(BaseUploadContentConfigurationTestCase): @@ -448,27 +655,51 @@ def test_server_dir_uploads_403_if_dir_not_set(self): class ServerDirectoryValidUsageTestCase(BaseUploadContentConfigurationTestCase): + # This tests the library contents API - I think equivalent functionality is available via library datasets API + # and should also be tested. 
     require_admin_user = True
 
     @classmethod
     def handle_galaxy_config_kwds(cls, config):
-        library_import_dir = os.path.join(cls._test_driver.galaxy_test_tmp_dir, "library_import_dir")
-        config["library_import_dir"] = library_import_dir
-        cls.dir_to_import = 'library'
-        full_dir_path = os.path.join(library_import_dir, cls.dir_to_import)
+        server_dir = cls.server_dir()
+        os.makedirs(server_dir)
+        config["library_import_dir"] = server_dir
+
+    def test_valid_server_dir_uploads_okay(self):
+        dir_to_import = 'library'
+        full_dir_path = os.path.join(self.server_dir(), dir_to_import)
         os.makedirs(full_dir_path)
-        cls.file_content = "create_test"
+        file_content = "hello world\n"
         with tempfile.NamedTemporaryFile(dir=full_dir_path, delete=False) as fh:
-            fh.write(cls.file_content)
-            cls.file_to_import = fh.name
+            fh.write(file_content)
+            file_to_import = fh.name
 
-    def test_valid_server_dir_uploads_okay(self):
-        self.library_populator.new_library_dataset("serverdirupload", upload_option="upload_directory", server_dir=self.dir_to_import)
+        library_dataset = self.library_populator.new_library_dataset("serverdirupload", upload_option="upload_directory", server_dir=dir_to_import)
         # Check the file is still there and was not modified
-        with open(self.file_to_import) as fh:
+        with open(file_to_import) as fh:
             read_content = fh.read()
-        assert read_content == self.file_content
+        assert read_content == file_content
+
+        assert library_dataset["file_size"] == 12, library_dataset
+
+    def test_link_data_only(self):
+        content = "hello world\n"
+        dir_path = os.path.join(self.server_dir(), "lib1")
+        file_path = self._write_file(dir_path, content)
+        library = self.library_populator.new_private_library("serverdirupload")
+        # link the server_dir file in place instead of copying it into Galaxy's object store
+        payload, files = self.library_populator.create_dataset_request(library, upload_option="upload_directory", server_dir="lib1", link_data=True)
+        response = self.library_populator.raw_library_contents_create(library["id"], payload, files=files)
+        assert response.status_code == 200, response.json()
+        dataset = response.json()[0]
+        ok_dataset = self.library_populator.wait_on_library_dataset(library, dataset)
+        assert ok_dataset["file_size"] == 12, ok_dataset
+        assert ok_dataset["file_name"] == file_path, ok_dataset
+
+    @classmethod
+    def server_dir(cls):
+        return cls.temp_config_dir("server")
 
 
 class ServerDirectoryRestrictedToAdminsUsageTestCase(BaseUploadContentConfigurationTestCase):
@@ -483,3 +714,130 @@ def test_library_import_dir_not_available_to_non_admins(self):
         payload, files = self.library_populator.create_dataset_request(library, upload_option="upload_directory", server_dir="library")
         response = self.library_populator.raw_library_contents_create(library["id"], payload, files=files)
         assert response.status_code == 403, response.json()
+
+
+class FetchByPathTestCase(BaseUploadContentConfigurationTestCase):
+
+    require_admin_user = True
+
+    @classmethod
+    def handle_galaxy_config_kwds(cls, config):
+        config["allow_path_paste"] = True
+
+    def test_fetch_path_to_folder(self):
+        history_id, library, destination = self.library_populator.setup_fetch_to_folder("simple_fetch")
+        bed_test_data_path = self.test_data_resolver.get_filename("4.bed")
+        assert os.path.exists(bed_test_data_path)
+        items = [{"src": "path", "path": bed_test_data_path, "info": "my cool bed"}]
+        targets = [{
+            "destination": destination,
+            "items": items
+        }]
+        payload = {
+            "history_id": history_id,  # TODO: Shouldn't be needed :(
+            "targets": json.dumps(targets),
+        }
+
self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + assert os.path.exists(bed_test_data_path) + + def test_fetch_link_data_only(self): + history_id, library, destination = self.library_populator.setup_fetch_to_folder("fetch_and_link") + bed_test_data_path = self.test_data_resolver.get_filename("4.bed") + assert os.path.exists(bed_test_data_path) + items = [{"src": "path", "path": bed_test_data_path, "info": "my cool bed", "link_data_only": True}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + assert dataset["file_name"] == bed_test_data_path, dataset + assert os.path.exists(bed_test_data_path) + + def test_fetch_recursive_archive(self): + history_id, library, destination = self.library_populator.setup_fetch_to_folder("recursive_archive") + archive_test_data_path = self.test_data_resolver.get_filename("testdir1.zip") + targets = [{ + "destination": destination, + "items_from": "archive", "src": "path", "path": archive_test_data_path, + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file1") + assert dataset["file_size"] == 6, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file2") + assert dataset["file_size"] == 6, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/dir1/file3") + assert dataset["file_size"] == 11, dataset + + def test_fetch_history_compressed_type(self): + destination = {"type": "hdas"} + archive = self.test_data_resolver.get_filename("1.fastqsanger.gz") + targets = [{ + "destination": destination, + "items": [{"src": "path", "path": archive, "ext": "fastqsanger.gz"}], + }] + payload = { + "history_id": self.history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + fetch_response = self.dataset_populator.fetch(payload) + self._assert_status_code_is(fetch_response, 200) + outputs = fetch_response.json()["outputs"] + assert len(outputs) == 1 + output = outputs[0] + assert output["name"] == "1.fastqsanger.gz" + contents_response = self.dataset_populator._get_contents_request(self.history_id) + assert contents_response.status_code == 200 + contents = contents_response.json() + assert len(contents) == 1, contents + assert contents[0]["extension"] == "fastqsanger.gz", contents[0] + assert contents[0]["name"] == "1.fastqsanger.gz", contents[0] + assert contents[0]["hid"] == 1, contents[0] + + def test_fetch_recursive_archive_history(self): + destination = {"type": "hdas"} + archive = self.test_data_resolver.get_filename("testdir1.zip") + targets = [{ + "destination": destination, + "items_from": "archive", "src": "path", "path": archive, + }] + payload = { + "history_id": self.history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + contents_response = self.dataset_populator._get_contents_request(self.history_id) + assert contents_response.status_code == 200 + contents = contents_response.json() 
+ assert len(contents) == 3 + + def test_fetch_recursive_archive_to_library(self): + bed_test_data_path = self.test_data_resolver.get_filename("testdir1.zip") + targets = [{ + "destination": {"type": "library", "name": "My Cool Library"}, + "items_from": "archive", "src": "path", "path": bed_test_data_path, + }] + payload = { + "history_id": self.history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + libraries = self.library_populator.get_libraries() + matching = [l for l in libraries if l["name"] == "My Cool Library"] + assert len(matching) == 1 + library = matching[0] + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file1") + assert dataset["file_size"] == 6, dataset diff --git a/tools/data_source/upload.py b/tools/data_source/upload.py index 59d8f46036f0..4e0cdcdb0429 100644 --- a/tools/data_source/upload.py +++ b/tools/data_source/upload.py @@ -18,8 +18,12 @@ from galaxy import util from galaxy.datatypes import sniff -from galaxy.datatypes.binary import Binary from galaxy.datatypes.registry import Registry +from galaxy.datatypes.upload_util import ( + handle_sniffable_binary_check, + handle_unsniffable_binary_check, + UploadProblemException, +) from galaxy.util.checkers import ( check_binary, check_bz2, @@ -36,12 +40,6 @@ assert sys.version_info[:2] >= (2, 7) -class UploadProblemException(Exception): - - def __init__(self, message): - self.message = message - - def file_err(msg, dataset, json_file): json_file.write(dumps(dict(type='dataset', ext='data', @@ -83,7 +81,10 @@ def add_file(dataset, registry, json_file, output_path): line_count = None converted_path = None stdout = None - link_data_only = dataset.get('link_data_only', 'copy_files') != 'copy_files' + link_data_only_str = dataset.get('link_data_only', 'copy_files') + if link_data_only_str not in ['link_data_only', 'copy_files']: + raise UploadProblemException("Invalid setting for option link_data_only - upload request misconfigured.") + link_data_only = link_data_only_str == 'link_data_only' # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed) # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their @@ -120,26 +121,21 @@ def add_file(dataset, registry, json_file, output_path): if dataset.type == 'url': try: - page = urlopen(dataset.path) # page will be .close()ed by sniff methods - temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) + dataset.path = sniff.stream_url_to_file(dataset.path) except Exception as e: raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e))) - dataset.path = temp_name + # See if we have an empty file if not os.path.exists(dataset.path): raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path) + if not os.path.getsize(dataset.path) > 0: raise UploadProblemException('The uploaded file is empty') + # Is dataset content supported sniffable binary? 
is_binary = check_binary(dataset.path) if is_binary: - # Sniff the data type - guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order) - # Set data_type only if guessed_ext is a binary datatype - datatype = registry.get_datatype_by_extension(guessed_ext) - if isinstance(datatype, Binary): - data_type = guessed_ext - ext = guessed_ext + data_type, ext = handle_sniffable_binary_check(data_type, ext, dataset.path, registry) if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): @@ -262,18 +258,9 @@ def add_file(dataset, registry, json_file, output_path): dataset.name = uncompressed_name data_type = 'zip' if not data_type: - if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type): - # We have a binary dataset, but it is not Bam, Sff or Pdf - data_type = 'binary' - parts = dataset.name.split(".") - if len(parts) > 1: - ext = parts[-1].strip().lower() - is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext) - if check_content and not is_ext_unsniffable_binary: - raise UploadProblemException('The uploaded binary file contains inappropriate content') - elif is_ext_unsniffable_binary and dataset.file_type != ext: - err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext) - raise UploadProblemException(err_msg) + data_type, ext = handle_unsniffable_binary_check( + data_type, ext, dataset.path, dataset.name, is_binary, dataset.file_type, check_content, registry + ) if not data_type: # We must have a text file if check_content and check_html(dataset.path): @@ -290,7 +277,7 @@ def add_file(dataset, registry, json_file, output_path): else: line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) if dataset.file_type == 'auto': - ext = sniff.guess_ext(dataset.path, registry.sniff_order) + ext = sniff.guess_ext(converted_path or dataset.path, registry.sniff_order) else: ext = dataset.file_type data_type = ext @@ -302,7 +289,7 @@ def add_file(dataset, registry, json_file, output_path): if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension(ext) - if dataset.type in ('server_dir', 'path_paste') and link_data_only: + if link_data_only: # Never alter a file that will not be copied to Galaxy's local file store. if datatype.dataset_content_needs_grooming(dataset.path): err_msg = 'The uploaded files need grooming, so change your Copy data into Galaxy? selection to be ' + \