Skip to content

Commit

Permalink
Hierarchical upload API optimized for folders & collections.
Browse files Browse the repository at this point in the history
Allows describing hierarchical data in JSON or inferring structure from archives or directories.

Datasets or archive sources can be specified via uploads, URLs, paths (if admin && allow_path_paste), library_import_dir/user_library_import_dir, and/or FTP imports. Unlike existing API endpoints, a mix of these on a per file basis is allowed and they work seemlessly between libraries and histories.

Supported "archives" include gzip, zip, bagit directories, bagit achives (with fetching and validations of downloads).

The existing upload API endpoint is quite rough to work with both in terms of adding parameters (e.g. the file type and dbkey hanlding in 4563 was difficult to implement, terribly hacky, and should seemingly have been trivial) and in terms of building requests (one needs to build a tool form - not describe sensible inputs in JSON). This API is built to be intelligable from an API standpoint instead of being constrained to the older style tool form. Additionally it built with hierarchical data in mind in a way that would not be easy at all enhancing the tool form components we don't even render.

This implements 5159 though much simpler YAML descriptions of data libraries should be possible basically as the API descriptions. We can replace the data library script in Ephemeris https://github.com/galaxyproject/ephemeris/blob/master/ephemeris/setup_data_libraries.py with one that converts a simple YAML file into an API call and allows many new options for free.

In future PRs I'll add filtering options to this and it will serve as the backend to 4733.
  • Loading branch information
jmchilton committed Jan 4, 2018
1 parent 9c94324 commit 35bb7e2
Show file tree
Hide file tree
Showing 29 changed files with 1,478 additions and 72 deletions.
5 changes: 1 addition & 4 deletions lib/galaxy/actions/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,10 +257,7 @@ def _make_library_uploaded_dataset(self, trans, params, name, path, type, librar
uploaded_dataset.link_data_only = link_data_only
uploaded_dataset.uuid = uuid_str
if link_data_only == 'link_to_files':
uploaded_dataset.data.file_name = os.path.abspath(path)
# Since we are not copying the file into Galaxy's managed
# default file location, the dataset should never be purgable.
uploaded_dataset.data.dataset.purgable = False
uploaded_dataset.data.link_to(path)
trans.sa_session.add_all((uploaded_dataset.data, uploaded_dataset.data.dataset))
trans.sa_session.flush()
return uploaded_dataset
Expand Down
4 changes: 4 additions & 0 deletions lib/galaxy/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from galaxy import config, jobs
from galaxy.jobs import metrics as job_metrics
from galaxy.managers.collections import DatasetCollectionManager
from galaxy.managers.folders import FolderManager
from galaxy.managers.libraries import LibraryManager
from galaxy.managers.tags import GalaxyTagManager
from galaxy.openid.providers import OpenIDProviders
from galaxy.queue_worker import GalaxyQueueWorker
Expand Down Expand Up @@ -90,6 +92,8 @@ def __init__(self, **kwargs):
self.tag_handler = GalaxyTagManager(self.model.context)
# Dataset Collection Plugins
self.dataset_collections_service = DatasetCollectionManager(self)
self.library_folder_manager = FolderManager()
self.library_manager = LibraryManager()

# Tool Data Tables
self._configure_tool_data_tables(from_shed_config=False)
Expand Down
7 changes: 7 additions & 0 deletions lib/galaxy/datatypes/sniff.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import zipfile

from six import text_type
from six.moves.urllib.request import urlopen

from galaxy import util
from galaxy.util import compression_utils
Expand All @@ -39,6 +40,12 @@ def get_test_fname(fname):
return full_path


def stream_url_to_file(path):
page = urlopen(path) # page will be .close()ed in stream_to_file
temp_name = stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
return temp_name


def stream_to_open_named_file(stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict'):
"""Writes a stream to the provided file descriptor, returns the file name. Closes file descriptor"""
# signature and behavor is somewhat odd, due to backwards compatibility, but this can/should be done better
Expand Down
47 changes: 47 additions & 0 deletions lib/galaxy/datatypes/upload_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from galaxy.datatypes import sniff
from galaxy.datatypes.binary import Binary


class UploadProblemException(Exception):

def __init__(self, message):
self.message = message


def handle_unsniffable_binary_check(data_type, ext, path, name, is_binary, requested_ext, check_content, registry):
"""Return modified values of data_type and ext if unsniffable binary encountered.
Throw UploadProblemException if content problems or extension mismatches occur.
Precondition: check_binary called returned True.
"""
if is_binary or registry.is_extension_unsniffable_binary(requested_ext):
# We have a binary dataset, but it is not Bam, Sff or Pdf
data_type = 'binary'
parts = name.split(".")
if len(parts) > 1:
ext = parts[-1].strip().lower()
is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext)
if check_content and not is_ext_unsniffable_binary:
raise UploadProblemException('The uploaded binary file contains inappropriate content')

elif is_ext_unsniffable_binary and requested_ext != ext:
err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext)
raise UploadProblemException(err_msg)
return data_type, ext


def handle_sniffable_binary_check(data_type, ext, path, registry):
"""Return modified values of data_type and ext if sniffable binary encountered.
Precondition: check_binary called returned True.
"""
# Sniff the data type
guessed_ext = sniff.guess_ext(path, registry.sniff_order)
# Set data_type only if guessed_ext is a binary datatype
datatype = registry.get_datatype_by_extension(guessed_ext)
if isinstance(datatype, Binary):
data_type = guessed_ext
ext = guessed_ext

return data_type, ext
1 change: 1 addition & 0 deletions lib/galaxy/dependencies/pinned-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pysam>=0.13
#python_lzo==1.8

# pure Python packages
bdbag==1.1.1
bz2file==0.98; python_version < '3.3'
ipaddress==1.0.18; python_version < '3.3'
boltons==17.1.0
Expand Down
2 changes: 1 addition & 1 deletion lib/galaxy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1371,7 +1371,7 @@ def path_rewriter(path):
collected_datasets = {
'primary': self.tool.collect_primary_datasets(out_data, self.get_tool_provided_job_metadata(), tool_working_directory, input_ext, input_dbkey)
}
self.tool.collect_dynamic_collections(
self.tool.collect_dynamic_outputs(
out_collections,
self.get_tool_provided_job_metadata(),
job_working_directory=tool_working_directory,
Expand Down
19 changes: 12 additions & 7 deletions lib/galaxy/managers/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,22 @@ def __init__(self, app):
self.tag_manager = tags.GalaxyTagManager(app.model.context)
self.ldda_manager = lddas.LDDAManager(app)

def precreate_dataset_collection_instance(self, trans, parent, name, implicit_inputs, implicit_output_name, structure):
def precreate_dataset_collection_instance(self, trans, parent, name, structure, implicit_inputs=None, implicit_output_name=None):
# TODO: prebuild all required HIDs and send them in so no need to flush in between.
dataset_collection = self.precreate_dataset_collection(structure)
dataset_collection = self.precreate_dataset_collection(structure, allow_unitialized_element=implicit_output_name is not None)
instance = self._create_instance_for_collection(
trans, parent, name, dataset_collection, implicit_inputs=implicit_inputs, implicit_output_name=implicit_output_name, flush=False
)
return instance

def precreate_dataset_collection(self, structure):
if structure.is_leaf or not structure.children_known:
return model.DatasetCollectionElement.UNINITIALIZED_ELEMENT
def precreate_dataset_collection(self, structure, allow_unitialized_element=True):
has_structure = not structure.is_leaf and structure.children_known
if not has_structure and allow_unitialized_element:
dataset_collection = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT
elif not has_structure:
collection_type_description = structure.collection_type_description
dataset_collection = model.DatasetCollection(populated=False)
dataset_collection.collection_type = collection_type_description.collection_type
else:
collection_type_description = structure.collection_type_description
dataset_collection = model.DatasetCollection(populated=False)
Expand All @@ -67,7 +72,7 @@ def precreate_dataset_collection(self, structure):
if substructure.is_leaf:
element = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT
else:
element = self.precreate_dataset_collection(substructure)
element = self.precreate_dataset_collection(substructure, allow_unitialized_element=allow_unitialized_element)

element = model.DatasetCollectionElement(
element=element,
Expand All @@ -78,7 +83,7 @@ def precreate_dataset_collection(self, structure):
dataset_collection.elements = elements
dataset_collection.element_count = len(elements)

return dataset_collection
return dataset_collection

def create(self, trans, parent, name, collection_type, element_identifiers=None,
elements=None, implicit_collection_info=None, trusted_identifiers=None,
Expand Down
6 changes: 6 additions & 0 deletions lib/galaxy/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2032,6 +2032,12 @@ def set_file_name(self, filename):
return self.dataset.set_file_name(filename)
file_name = property(get_file_name, set_file_name)

def link_to(self, path):
self.file_name = os.path.abspath(path)
# Since we are not copying the file into Galaxy's managed
# default file location, the dataset should never be purgable.
self.dataset.purgable = False

@property
def extra_files_path(self):
return self.dataset.extra_files_path
Expand Down
12 changes: 8 additions & 4 deletions lib/galaxy/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
# Tools that require Galaxy's Python environment to be preserved.
GALAXY_LIB_TOOLS_UNVERSIONED = [
"upload1",
"__DATA_FETCH__",
# Legacy tools bundled with Galaxy.
"vcf_to_maf_customtrack1",
"laj_1",
Expand Down Expand Up @@ -1041,7 +1042,10 @@ def parse_input_elem(self, page_source, enctypes, context=None):
group.file_type_name = elem.get('file_type_name', group.file_type_name)
group.default_file_type = elem.get('default_file_type', group.default_file_type)
group.metadata_ref = elem.get('metadata_ref', group.metadata_ref)
rval[group.file_type_name].refresh_on_change = True
try:
rval[group.file_type_name].refresh_on_change = True
except KeyError:
pass
group_page_source = XmlPageSource(elem)
group.inputs = self.parse_input_elem(group_page_source, enctypes, context)
rval[group.name] = group
Expand Down Expand Up @@ -1592,10 +1596,10 @@ def collect_primary_datasets(self, output, tool_provided_metadata, job_working_d
"""
return output_collect.collect_primary_datasets(self, output, tool_provided_metadata, job_working_directory, input_ext, input_dbkey=input_dbkey)

def collect_dynamic_collections(self, output, tool_provided_metadata, **kwds):
""" Find files corresponding to dynamically structured collections.
def collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds):
"""Collect dynamic outputs associated with a job from this tool.
"""
return output_collect.collect_dynamic_collections(self, output, tool_provided_metadata, **kwds)
return output_collect.collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds)

def to_archive(self):
tool = self
Expand Down
58 changes: 58 additions & 0 deletions lib/galaxy/tools/actions/upload.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import logging

from galaxy.exceptions import RequestParameterMissingException
from galaxy.tools.actions import upload_common
from galaxy.util import ExecutionTimer
from . import ToolAction
Expand Down Expand Up @@ -36,3 +38,59 @@ def execute(self, tool, trans, incoming={}, set_output_hid=True, history=None, *
rval = upload_common.create_job(trans, incoming, tool, json_file_path, data_list, history=history)
log.debug("Created upload job %s" % create_job_timer)
return rval


class FetchUploadToolAction(ToolAction):

def execute(self, tool, trans, incoming={}, set_output_hid=True, history=None, **kwargs):
dataset_upload_inputs = []
for input_name, input in tool.inputs.items():
if input.type == "upload_dataset":
dataset_upload_inputs.append(input)
assert dataset_upload_inputs, Exception("No dataset upload groups were found.")

persisting_uploads_timer = ExecutionTimer()
# precreated_datasets = upload_common.get_precreated_datasets(trans, incoming, trans.app.model.HistoryDatasetAssociation)
incoming = upload_common.persist_uploads(incoming, trans)
log.debug("Persisted uploads %s" % persisting_uploads_timer)

# Now replace references in requests with these.
files = incoming.get("files", [])
files_iter = iter(files)
request = json.loads(incoming.get("request_json"))

def replace_file_srcs(request_part):
if isinstance(request_part, dict):
if request_part.get("src", None) == "files":
path_def = next(files_iter)
if path_def is None or path_def["file_data"] is None:
raise RequestParameterMissingException("Failed to find uploaded file matching target with src='files'")
request_part["path"] = path_def["file_data"]["local_filename"]
if "name" not in request_part:
request_part["name"] = path_def["file_data"]["filename"]
request_part["src"] = "path"
else:
for key, value in request_part.items():
replace_file_srcs(value)
elif isinstance(request_part, list):
for value in request_part:
replace_file_srcs(value)

replace_file_srcs(request)

incoming["request_json"] = json.dumps(request)
log.info("incoming are %s" % incoming)
# We can pass an empty string as the cntrller here since it is used to check whether we
# are in an admin view, and this tool is currently not used there.
check_and_cleanup_timer = ExecutionTimer()
# uploaded_datasets = upload_common.get_uploaded_datasets(trans, '', incoming, precreated_datasets, dataset_upload_inputs, history=history)
# upload_common.cleanup_unused_precreated_datasets(precreated_datasets)

# if not uploaded_datasets:
# return None, 'No data was entered in the upload form, please go back and choose data to upload.'

log.debug("Checked and cleaned uploads %s" % check_and_cleanup_timer)
create_job_timer = ExecutionTimer()
rval = upload_common.create_job(trans, incoming, tool, None, [], history=history)
log.debug("Created upload job %s" % create_job_timer)
return rval
12 changes: 8 additions & 4 deletions lib/galaxy/tools/actions/upload_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from urllib.parse import urlparse

from galaxy import datatypes, util
from galaxy.exceptions import ObjectInvalid
from galaxy.exceptions import ConfigDoesNotAllowException, ObjectInvalid
from galaxy.managers import tags
from galaxy.util import unicodify
from galaxy.util.odict import odict
Expand Down Expand Up @@ -102,7 +102,7 @@ def validate_url(url, ip_whitelist):
pass
else:
# Otherwise, we deny access.
raise Exception("Access to this address in not permitted by server configuration")
raise ConfigDoesNotAllowException("Access to this address in not permitted by server configuration")
return url


Expand All @@ -123,7 +123,7 @@ def persist_uploads(params, trans):
local_filename=local_filename)
elif type(f) == dict and 'local_filename' not in f:
raise Exception('Uploaded file was encoded in a way not understood by Galaxy.')
if upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '':
if 'url_paste' in upload_dataset and upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '':
upload_dataset['url_paste'] = datatypes.sniff.stream_to_file(
StringIO(validate_url(upload_dataset['url_paste'], trans.app.config.fetch_url_whitelist_ips)),
prefix="strio_url_paste_"
Expand Down Expand Up @@ -334,7 +334,11 @@ def new_upload(trans, cntrller, uploaded_dataset, library_bunch=None, history=No
def get_uploaded_datasets(trans, cntrller, params, precreated_datasets, dataset_upload_inputs, library_bunch=None, history=None):
uploaded_datasets = []
for dataset_upload_input in dataset_upload_inputs:
uploaded_datasets.extend(dataset_upload_input.get_uploaded_datasets(trans, params))
try:
uploaded_datasets.extend(dataset_upload_input.get_uploaded_datasets(trans, params))
except AttributeError:
# TODO: refine...
pass
for uploaded_dataset in uploaded_datasets:
data = get_precreated_dataset(precreated_datasets, uploaded_dataset.name)
if not data:
Expand Down
Loading

0 comments on commit 35bb7e2

Please sign in to comment.