Skip to content

Commit

Permalink
Merge pull request #5220 from jmchilton/upload_2.0
Browse files Browse the repository at this point in the history
Hierarchical upload API optimized for folders & collections.
  • Loading branch information
bgruening authored Mar 9, 2018
2 parents 300fffe + bca2c3c commit 2195911
Show file tree
Hide file tree
Showing 37 changed files with 2,040 additions and 256 deletions.
6 changes: 2 additions & 4 deletions lib/galaxy/actions/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,16 +249,14 @@ def _make_library_uploaded_dataset(self, trans, params, name, path, type, librar
uploaded_dataset.to_posix_lines = params.get('to_posix_lines', None)
uploaded_dataset.space_to_tab = params.get('space_to_tab', None)
uploaded_dataset.tag_using_filenames = params.get('tag_using_filenames', True)
uploaded_dataset.purge_source = getattr(trans.app.config, 'ftp_upload_purge', True)
if in_folder:
uploaded_dataset.in_folder = in_folder
uploaded_dataset.data = upload_common.new_upload(trans, 'api', uploaded_dataset, library_bunch)
uploaded_dataset.link_data_only = link_data_only
uploaded_dataset.uuid = uuid_str
if link_data_only == 'link_to_files':
uploaded_dataset.data.file_name = os.path.abspath(path)
# Since we are not copying the file into Galaxy's managed
# default file location, the dataset should never be purgable.
uploaded_dataset.data.dataset.purgable = False
uploaded_dataset.data.link_to(path)
trans.sa_session.add_all((uploaded_dataset.data, uploaded_dataset.data.dataset))
trans.sa_session.flush()
return uploaded_dataset
Expand Down
4 changes: 4 additions & 0 deletions lib/galaxy/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
from galaxy import config, jobs
from galaxy.jobs import metrics as job_metrics
from galaxy.managers.collections import DatasetCollectionManager
from galaxy.managers.folders import FolderManager
from galaxy.managers.histories import HistoryManager
from galaxy.managers.libraries import LibraryManager
from galaxy.managers.tags import GalaxyTagManager
from galaxy.openid.providers import OpenIDProviders
from galaxy.queue_worker import GalaxyQueueWorker
Expand Down Expand Up @@ -96,6 +98,8 @@ def __init__(self, **kwargs):
self.history_manager = HistoryManager(self)
self.dependency_resolvers_view = DependencyResolversView(self)
self.test_data_resolver = test_data.TestDataResolver(file_dirs=self.config.tool_test_data_directories)
self.library_folder_manager = FolderManager()
self.library_manager = LibraryManager()

# Tool Data Tables
self._configure_tool_data_tables(from_shed_config=False)
Expand Down
22 changes: 17 additions & 5 deletions lib/galaxy/datatypes/sniff.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import zipfile

from six import text_type
from six.moves.urllib.request import urlopen

from galaxy import util
from galaxy.util import compression_utils
Expand All @@ -39,6 +40,12 @@ def get_test_fname(fname):
return full_path


def stream_url_to_file(path):
    """Open the URL ``path`` and stream its contents into a temporary file.

    The charset extracted from the response headers is handed to
    ``stream_to_file`` as the source encoding.

    :param path: URL to open with ``urlopen``.
    :returns: the value returned by ``stream_to_file`` (the temp file name).
    """
    response = urlopen(path)
    charset = util.get_charset_from_http_headers(response.headers)
    # The response is not closed here; stream_to_file .close()es it.
    return stream_to_file(response, prefix='url_paste', source_encoding=charset)


def stream_to_open_named_file(stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict'):
"""Writes a stream to the provided file descriptor, returns the file name. Closes file descriptor"""
# signature and behavior are somewhat odd, due to backwards compatibility, but this can/should be done better
Expand Down Expand Up @@ -131,7 +138,7 @@ def convert_newlines(fname, in_place=True, tmp_dir=None, tmp_prefix="gxupload"):
return (i, temp_name)


def sep2tabs(fname, in_place=True, patt="\\s+"):
def sep2tabs(fname, in_place=True, patt="\\s+", tmp_dir=None, tmp_prefix="gxupload"):
"""
Transforms in place a 'sep' separated file to a tab separated one
Expand All @@ -143,13 +150,18 @@ def sep2tabs(fname, in_place=True, patt="\\s+"):
'1\\t2\\n3\\t4\\n'
"""
regexp = re.compile(patt)
fd, temp_name = tempfile.mkstemp()
fd, temp_name = tempfile.mkstemp(prefix=tmp_prefix, dir=tmp_dir)
with os.fdopen(fd, "wt") as fp:
i = None
for i, line in enumerate(open(fname)):
line = line.rstrip('\r\n')
elems = regexp.split(line)
fp.write("%s\n" % '\t'.join(elems))
if line.endswith("\r"):
line = line.rstrip('\r')
elems = regexp.split(line)
fp.write("%s\r" % '\t'.join(elems))
else:
line = line.rstrip('\n')
elems = regexp.split(line)
fp.write("%s\n" % '\t'.join(elems))
if i is None:
i = 0
else:
Expand Down
47 changes: 47 additions & 0 deletions lib/galaxy/datatypes/upload_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from galaxy.datatypes import sniff
from galaxy.datatypes.binary import Binary


class UploadProblemException(Exception):
    """Signal that an uploaded file failed a content or extension check.

    :param message: human-readable description of the problem; kept on the
        ``message`` attribute and also passed to ``Exception`` so that
        ``str(exc)`` and ``exc.args`` carry it.
    """

    def __init__(self, message):
        # Without this call str(exc) is '' and exc.args is (), which hides
        # the reason in logs and tracebacks.
        super(UploadProblemException, self).__init__(message)
        self.message = message


def handle_unsniffable_binary_check(data_type, ext, path, name, is_binary, requested_ext, check_content, registry):
    """Adjust ``data_type`` and ``ext`` for a binary upload that cannot be sniffed.

    Nothing changes unless ``is_binary`` is true or ``requested_ext`` is an
    unsniffable binary extension according to ``registry``. In that case
    ``data_type`` becomes ``'binary'`` and, when ``name`` contains a dot,
    ``ext`` is replaced by the stripped, lower-cased text after the last dot.

    ``path`` is accepted but not used by this function.

    Precondition: check_binary called returned True.

    :returns: tuple ``(data_type, ext)``.
    :raises UploadProblemException: if ``check_content`` is set and the
        extension taken from ``name`` is not an unsniffable binary extension,
        or if it is one but differs from ``requested_ext``.
    """
    if not (is_binary or registry.is_extension_unsniffable_binary(requested_ext)):
        return data_type, ext

    # We have a binary dataset, but it is not Bam, Sff or Pdf
    data_type = 'binary'
    name_pieces = name.split(".")
    if len(name_pieces) <= 1:
        # No extension in the name: keep the caller-supplied ext.
        return data_type, ext

    ext = name_pieces[-1].strip().lower()
    ext_is_unsniffable = registry.is_extension_unsniffable_binary(ext)
    if not ext_is_unsniffable:
        if check_content:
            raise UploadProblemException('The uploaded binary file contains inappropriate content')
    elif requested_ext != ext:
        raise UploadProblemException(
            "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext)
        )
    return data_type, ext


def handle_sniffable_binary_check(data_type, ext, path, registry):
    """Replace ``data_type`` and ``ext`` with the sniffed extension when it is binary.

    The extension is guessed from the file at ``path`` using
    ``registry.sniff_order``. It is adopted only if the registry maps it to a
    ``Binary`` datatype instance; otherwise the incoming values are returned
    unchanged.

    Precondition: check_binary called returned True.

    :returns: tuple ``(data_type, ext)``.
    """
    sniffed_ext = sniff.guess_ext(path, registry.sniff_order)
    sniffed_datatype = registry.get_datatype_by_extension(sniffed_ext)
    if not isinstance(sniffed_datatype, Binary):
        # Non-binary guess: leave the caller's values alone.
        return data_type, ext
    return sniffed_ext, sniffed_ext
1 change: 1 addition & 0 deletions lib/galaxy/dependencies/pinned-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ uWSGI==2.0.15
pysam==0.14

# pure Python packages
bdbag==1.1.1
bleach==2.1.3
bz2file==0.98; python_version < '3.3'
ipaddress==1.0.18; python_version < '3.3'
Expand Down
2 changes: 1 addition & 1 deletion lib/galaxy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1380,7 +1380,7 @@ def path_rewriter(path):
collected_datasets = {
'primary': self.tool.collect_primary_datasets(out_data, self.get_tool_provided_job_metadata(), tool_working_directory, input_ext, input_dbkey)
}
self.tool.collect_dynamic_collections(
self.tool.collect_dynamic_outputs(
out_collections,
self.get_tool_provided_job_metadata(),
job_working_directory=tool_working_directory,
Expand Down
19 changes: 12 additions & 7 deletions lib/galaxy/managers/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,22 @@ def __init__(self, app):
self.tag_manager = tags.GalaxyTagManager(app.model.context)
self.ldda_manager = lddas.LDDAManager(app)

def precreate_dataset_collection_instance(self, trans, parent, name, implicit_inputs, implicit_output_name, structure):
def precreate_dataset_collection_instance(self, trans, parent, name, structure, implicit_inputs=None, implicit_output_name=None):
# TODO: prebuild all required HIDs and send them in so no need to flush in between.
dataset_collection = self.precreate_dataset_collection(structure)
dataset_collection = self.precreate_dataset_collection(structure, allow_unitialized_element=implicit_output_name is not None)
instance = self._create_instance_for_collection(
trans, parent, name, dataset_collection, implicit_inputs=implicit_inputs, implicit_output_name=implicit_output_name, flush=False
)
return instance

def precreate_dataset_collection(self, structure):
if structure.is_leaf or not structure.children_known:
return model.DatasetCollectionElement.UNINITIALIZED_ELEMENT
def precreate_dataset_collection(self, structure, allow_unitialized_element=True):
has_structure = not structure.is_leaf and structure.children_known
if not has_structure and allow_unitialized_element:
dataset_collection = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT
elif not has_structure:
collection_type_description = structure.collection_type_description
dataset_collection = model.DatasetCollection(populated=False)
dataset_collection.collection_type = collection_type_description.collection_type
else:
collection_type_description = structure.collection_type_description
dataset_collection = model.DatasetCollection(populated=False)
Expand All @@ -67,7 +72,7 @@ def precreate_dataset_collection(self, structure):
if substructure.is_leaf:
element = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT
else:
element = self.precreate_dataset_collection(substructure)
element = self.precreate_dataset_collection(substructure, allow_unitialized_element=allow_unitialized_element)

element = model.DatasetCollectionElement(
element=element,
Expand All @@ -78,7 +83,7 @@ def precreate_dataset_collection(self, structure):
dataset_collection.elements = elements
dataset_collection.element_count = len(elements)

return dataset_collection
return dataset_collection

def create(self, trans, parent, name, collection_type, element_identifiers=None,
elements=None, implicit_collection_info=None, trusted_identifiers=None,
Expand Down
6 changes: 6 additions & 0 deletions lib/galaxy/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2035,6 +2035,12 @@ def set_file_name(self, filename):
return self.dataset.set_file_name(filename)
file_name = property(get_file_name, set_file_name)

def link_to(self, path):
    """Point this dataset at the file at ``path`` and mark it non-purgable.

    ``file_name`` is set to the absolute form of ``path``.
    """
    absolute_path = os.path.abspath(path)
    self.file_name = absolute_path
    # The file is not copied into Galaxy's managed default file location,
    # so the dataset should never be purgable.
    self.dataset.purgable = False

@property
def extra_files_path(self):
return self.dataset.extra_files_path
Expand Down
12 changes: 8 additions & 4 deletions lib/galaxy/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
# Tools that require Galaxy's Python environment to be preserved.
GALAXY_LIB_TOOLS_UNVERSIONED = [
"upload1",
"__DATA_FETCH__",
# Legacy tools bundled with Galaxy.
"vcf_to_maf_customtrack1",
"laj_1",
Expand Down Expand Up @@ -1107,7 +1108,10 @@ def parse_input_elem(self, page_source, enctypes, context=None):
group.file_type_name = elem.get('file_type_name', group.file_type_name)
group.default_file_type = elem.get('default_file_type', group.default_file_type)
group.metadata_ref = elem.get('metadata_ref', group.metadata_ref)
rval[group.file_type_name].refresh_on_change = True
try:
rval[group.file_type_name].refresh_on_change = True
except KeyError:
pass
group_page_source = XmlPageSource(elem)
group.inputs = self.parse_input_elem(group_page_source, enctypes, context)
rval[group.name] = group
Expand Down Expand Up @@ -1658,10 +1662,10 @@ def collect_primary_datasets(self, output, tool_provided_metadata, job_working_d
"""
return output_collect.collect_primary_datasets(self, output, tool_provided_metadata, job_working_directory, input_ext, input_dbkey=input_dbkey)

def collect_dynamic_collections(self, output, tool_provided_metadata, **kwds):
""" Find files corresponding to dynamically structured collections.
def collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds):
"""Collect dynamic outputs associated with a job from this tool.
"""
return output_collect.collect_dynamic_collections(self, output, tool_provided_metadata, **kwds)
return output_collect.collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds)

def to_archive(self):
tool = self
Expand Down
Loading

0 comments on commit 2195911

Please sign in to comment.