From 537648d273f5d883c190a171326c776f76653d2c Mon Sep 17 00:00:00 2001
From: Arjun Arkal Rao
Date: Sun, 5 Mar 2017 15:22:16 -0800
Subject: [PATCH] Cleanup codebase (resolves #169)

resolves #169
resolves #163

Multiple docstrings were outdated and very few were sphinx-compatible.
Made the entire codebase PEP8 compliant (except for 100-char lines).
Renamed pipeline_launchpad to launch_protect.
---
 src/protect/addons.py                         |  28 ++-
 src/protect/alignment/common.py               |  19 +-
 src/protect/alignment/dna.py                  | 100 ++++----
 src/protect/alignment/rna.py                  |  61 +++--
 src/protect/binding_prediction/__init__.py    |   2 +-
 src/protect/binding_prediction/common.py      | 155 +++++-------
 src/protect/binding_prediction/mhci.py        |  19 +-
 src/protect/binding_prediction/mhcii.py       |  24 +-
 src/protect/common.py                         | 197 ++++++---------
 src/protect/expression_profiling/__init__.py  |   2 +-
 src/protect/expression_profiling/rsem.py      |  42 ++--
 src/protect/haplotyping/__init__.py           |   2 +-
 src/protect/haplotyping/phlat.py              |  76 +++---
 src/protect/mutation_annotation/__init__.py   |   2 +-
 src/protect/mutation_annotation/snpeff.py     |  35 ++-
 src/protect/mutation_calling/__init__.py      |   2 +-
 src/protect/mutation_calling/common.py        |  80 +++---
 src/protect/mutation_calling/fusion.py        |   9 +-
 src/protect/mutation_calling/indel.py         |  10 +-
 src/protect/mutation_calling/muse.py          | 128 +++++-----
 src/protect/mutation_calling/mutect.py        | 116 ++++-----
 src/protect/mutation_calling/radia.py         | 163 ++++++------
 src/protect/mutation_calling/somaticsniper.py | 140 +++++------
 src/protect/mutation_calling/strelka.py       | 130 +++++-----
 src/protect/mutation_translation.py           |  45 ++--
 src/protect/pipeline/ProTECT.py               | 237 +++++++++---------
 src/protect/qc/__init__.py                    |   2 +-
 src/protect/qc/rna.py                         |  24 +-
 src/protect/rankboost.py                      |  37 ++-
 src/protect/test/ci/test_protect.py           |   3 +-
 30 files changed, 915 insertions(+), 975 deletions(-)

diff --git a/src/protect/addons.py b/src/protect/addons.py
index df5a6430..e2a4dd9b 100644
--- a/src/protect/addons.py
+++ b/src/protect/addons.py
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
+
 from collections import Counter
+
 from protect.common import export_results, get_files_from_filestore, untargz
 from protect.haplotyping.phlat import parse_phlat_file
 
@@ -23,13 +25,14 @@ def run_mhc_gene_assessment(job, rsem_files, rna_haplotype, univ_options, mhc_ge
 
 def run_mhc_gene_assessment(job, rsem_files, rna_haplotype, univ_options, mhc_genes_options):
     """
-    This is a convenience function that runs assess_mhc_genes.
-    :param job job:
+    A wrapper for assess_mhc_genes.
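The `addChildJobFn(...).rv()` idiom these wrappers rely on is easiest to see in isolation. A minimal, self-contained sketch with a hypothetical child job (not pipeline code) follows:

    from toil.job import Job

    def child(job, x):
        # Stand-in for assess_mhc_genes: any function whose first argument
        # is the Toil job object can be wrapped this way.
        return x * 2

    def wrapper(job, x):
        # .rv() is a promise for the child's return value; Toil resolves it
        # only once the child job has actually run.
        return job.addChildJobFn(child, x).rv()

    if __name__ == '__main__':
        options = Job.Runner.getDefaultOptions('./toil_jobstore')
        print(Job.Runner.startToil(Job.wrapJobFn(wrapper, 21), options))  # 42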
+
     :param dict rsem_files: Results from running rsem
     :param str rna_haplotype: The job store id for the rna haplotype file
-    :param dict univ_options: Universal Options
+    :param dict univ_options: Dict of universal options used by almost all tools
     :param dict mhc_genes_options: Options specific to assessing the MHC genes
     :return: The results of running assess_mhc_genes
+    :rtype: toil.fileStore.FileID
     """
     return job.addChildJobFn(assess_mhc_genes, rsem_files['rsem.isoforms.results'], rna_haplotype,
                              univ_options, mhc_genes_options).rv()
@@ -37,12 +40,15 @@ def run_mhc_gene_assessment(job, rsem_files, rna_haplotype, univ_options, mhc_ge
 
 def assess_mhc_genes(job, isoform_expression, rna_haplotype, univ_options, mhc_genes_options):
     """
-    This module will assess the prevalence of the various genes in the MHC pathway and return a
-    report in the tsv format
-    :param isoform_expression: Isoform expression from run_rsem
-    :param rna_haplotype: PHLAT output from running on rna
-    :param univ_options: Universal options for the pipeline
-    :param mhc_genes_options: options specific to this module
+    Assess the prevalence of the various genes in the MHC pathway and return a report in the tsv
+    format.
+
+    :param toil.fileStore.FileID isoform_expression: fsID for the rsem isoform expression file
+    :param toil.fileStore.FileID rna_haplotype: fsID for the RNA PHLAT file
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict mhc_genes_options: Options specific to assessing the MHC genes
+    :return: The fsID for the mhc pathway report file
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running mhc gene assessment on %s' % univ_options['patient'])
     work_dir = os.getcwd()
@@ -91,7 +97,9 @@ def assess_mhc_genes(job, isoform_expression, rna_haplotype, univ_options, mhc_g
     for mhcii_allele in ('HLA_DQA', 'HLA_DQB', 'HLA_DRA', 'HLA_DRB'):
         if mhcii_allele != 'HLA_DRA':
             num_alleles = len(mhc_alleles[mhcii_allele])
-            result = 'FAIL' if num_alleles == 0 else 'LOW' if num_alleles == 1 else 'PASS'
+            result = ('FAIL' if num_alleles == 0 else
+                      'LOW' if num_alleles == 1 else
+                      'PASS')
             print("{:12}{:<12}{:<12}{:12}".format(mhcii_allele, 2, num_alleles, result),
                   file=mpr)
         else:
diff --git a/src/protect/alignment/common.py b/src/protect/alignment/common.py
index 12e63760..0d07468c 100644
--- a/src/protect/alignment/common.py
+++ b/src/protect/alignment/common.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 from __future__ import absolute_import
 from math import ceil
-from protect.common import docker_call, get_files_from_filestore, export_results
+from protect.common import docker_call, export_results, get_files_from_filestore
 
 import os
 
@@ -28,14 +28,15 @@ def index_bamfile(job, bamfile, sample_type, univ_options, samtools_options):
     """
     This module indexes BAMFILE
     ARGUMENTS
-    1. bamfile:
-    2. sample_type: string of 'tumor_dna' or 'normal_dna'
-    3. univ_options: Dict of universal arguments used by almost all tools
-         univ_options
-              +- 'dockerhub':
-    RETURN VALUES
-    1. output_files: REFER output_files in run_bwa(). This module is the one is
-                     the one that generates the files.
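The samtools call this function wraps can be sketched on its own, assuming the docker_call helper from protect.common later in this patch; the version string is illustrative, not a pinned default:

    import os
    from protect.common import docker_call

    def index_bam(bam_path, samtools_version='1.2'):
        # samtools writes <bam>.bai next to the input, inside the mounted
        # /data directory that docker_call maps onto work_dir.
        work_dir = os.path.dirname(os.path.abspath(bam_path))
        parameters = ['index', '/data/' + os.path.basename(bam_path)]
        docker_call(tool='samtools', tool_parameters=parameters, work_dir=work_dir,
                    tool_version=samtools_version)
        return bam_path + '.bai'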
+ :param toil.fileStore.FileID bamfile: fsID for the bam file + :param str sample_type: Description of the sample to inject into the filename + :param dict univ_options: Dict of universal options used by almost all tools + :param dict samtools_options: Options specific to samtools + :return: Dict containing input bam and the generated index (.bam.bai) + output_files: + |- '_fix_pg_sorted.bam': fsID + +- '_fix_pg_sorted.bam.bai': fsID + :rtype: dict """ job.fileStore.logToMaster('Running samtools-index on %s:%s' % (univ_options['patient'], sample_type)) diff --git a/src/protect/alignment/dna.py b/src/protect/alignment/dna.py index 586d3578..f529d533 100644 --- a/src/protect/alignment/dna.py +++ b/src/protect/alignment/dna.py @@ -14,6 +14,7 @@ # limitations under the License. from __future__ import absolute_import, print_function from math import ceil + from protect.alignment.common import index_bamfile, index_disk from protect.common import docker_call, docker_path, get_files_from_filestore, is_gzipfile, untargz from toil.job import PromisedRequirement @@ -41,7 +42,17 @@ def regroup_disk(reheader_bam): def align_dna(job, fastqs, sample_type, univ_options, bwa_options): """ - This is a convenience function that runs the entire dna alignment subgraph + A wrapper for the entire dna alignment subgraph. + + :param list fastqs: The input fastqs for alignment + :param str sample_type: Description of the sample to inject into the filename + :param dict univ_options: Dict of universal options used by almost all tools + :param dict bwa_options: Options specific to bwa + :return: Dict containing output bam and bai + output_files: + |- '_fix_pg_sorted.bam': fsID + +- '_fix_pg_sorted.bam.bai': fsID + :rtype: dict """ bwa = job.wrapJobFn(run_bwa, fastqs, sample_type, univ_options, bwa_options, disk=PromisedRequirement(bwa_disk, fastqs, bwa_options['index']), @@ -69,28 +80,14 @@ def align_dna(job, fastqs, sample_type, univ_options, bwa_options): def run_bwa(job, fastqs, sample_type, univ_options, bwa_options): """ - This module aligns the SAMPLE_TYPE dna fastqs to the reference - - ARGUMENTS -- depicts the sample type. Substitute with 'tumor'/'normal' - 1. fastqs: Dict of list of input WGS/WXS fastqs - fastqs - +- '_dna': [ , ] - 2. sample_type: string of 'tumor_dna' or 'normal_dna' - 3. univ_options: Dict of universal arguments used by almost all tools - univ_options - +- 'dockerhub': - 4. bwa_options: Dict of parameters specific to bwa - bwa_options - |- 'index': - +- 'n': - - RETURN VALUES - 1. output_files: Dict of aligned bam + reference (nested return) - output_files - |- '_fix_pg_sorted.bam': - +- '_fix_pg_sorted.bam.bai': - - This module corresponds to nodes 3 and 4 on the tree + Align a pair of fastqs with bwa. + + :param list fastqs: The input fastqs for alignment + :param str sample_type: Description of the sample to inject into the filename + :param dict univ_options: Dict of universal options used by almost all tools + :param dict bwa_options: Options specific to bwa + :return: fsID for the generated sam + :rtype: toil.fileStore.FileID """ job.fileStore.logToMaster('Running bwa on %s:%s' % (univ_options['patient'], sample_type)) work_dir = os.getcwd() @@ -126,16 +123,14 @@ def run_bwa(job, fastqs, sample_type, univ_options, bwa_options): def bam_conversion(job, samfile, sample_type, univ_options, samtools_options): """ - This module converts SAMFILE from sam to bam - - ARGUMENTS - 1. samfile: - 2. sample_type: string of 'tumor_dna' or 'normal_dna' - 3. 
univ_options: Dict of universal arguments used by almost all tools - univ_options - +- 'dockerhub': - RETURN VALUES - 1. output_files: REFER output_files in run_bwa() + Convert a sam to a bam. + + :param dict samfile: The input sam file + :param str sample_type: Description of the sample to inject into the filename + :param dict univ_options: Dict of universal options used by almost all tools + :param dict samtools_options: Options specific to samtools + :return: fsID for the generated bam + :rtype: toil.fileStore.FileID """ job.fileStore.logToMaster('Running sam2bam on %s:%s' % (univ_options['patient'], sample_type)) work_dir = os.getcwd() @@ -159,16 +154,15 @@ def bam_conversion(job, samfile, sample_type, univ_options, samtools_options): def fix_bam_header(job, bamfile, sample_type, univ_options, samtools_options): """ - This module modified the header in BAMFILE - - ARGUMENTS - 1. bamfile: - 2. sample_type: string of 'tumor_dna' or 'normal_dna' - 3. univ_options: Dict of universal arguments used by almost all tools - univ_options - +- 'dockerhub': - RETURN VALUES - 1. output_files: REFER output_files in run_bwa() + Fix the bam header to remove the command line call. Failing to do this causes Picard to reject + the bam. + + :param dict bamfile: The input bam file + :param str sample_type: Description of the sample to inject into the filename + :param dict univ_options: Dict of universal options used by almost all tools + :param dict samtools_options: Options specific to samtools + :return: fsID for the output bam + :rtype: toil.fileStore.FileID """ job.fileStore.logToMaster('Running reheader on %s:%s' % (univ_options['patient'], sample_type)) work_dir = os.getcwd() @@ -203,16 +197,14 @@ def fix_bam_header(job, bamfile, sample_type, univ_options, samtools_options): def add_readgroups(job, bamfile, sample_type, univ_options, picard_options): """ - This module adds the appropriate read groups to the bam file - ARGUMENTS - 1. bamfile: - 2. sample_type: string of 'tumor_dna' or 'normal_dna' - 3. univ_options: Dict of universal arguments used by almost all tools - univ_options - |- 'dockerhub': - +- 'java_Xmx': value for max heap passed to java - RETURN VALUES - 1. output_files: REFER output_files in run_bwa() + Add read groups to the bam. 
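Note that docker_call's java_opts keyword becomes java_xmx in this patch (see protect/common.py below). A hedged sketch of the picard call this function makes, mirroring the parameter list in the hunk that follows; the heap size and tool version are illustrative values:

    import os
    from protect.common import docker_call

    def add_read_groups_sketch(bam_path, sample_type, java_xmx='20G',
                               picard_version='1.135'):
        work_dir = os.path.dirname(os.path.abspath(bam_path))
        out_bam = sample_type + '_aligned_fixpg_sorted_reheader.bam'
        parameters = ['AddOrReplaceReadGroups',
                      'CREATE_INDEX=false',
                      'I=/data/' + os.path.basename(bam_path),
                      'O=/data/' + out_bam,
                      'SO=coordinate', 'ID=1',
                      'LB=' + sample_type,
                      'PL=ILLUMINA', 'PU=12345',
                      # rstrip('_dna') drops the '_dna' suffix for the
                      # 'tumor_dna'/'normal_dna' names used in this pipeline.
                      'SM=' + sample_type.rstrip('_dna')]
        docker_call(tool='picard', tool_parameters=parameters, work_dir=work_dir,
                    java_xmx=java_xmx, tool_version=picard_version)
        return os.path.join(work_dir, out_bam)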
+ + :param dict bamfile: The input bam file + :param str sample_type: Description of the sample to inject into the filename + :param dict univ_options: Dict of universal options used by almost all tools + :param dict picard_options: Options specific to picard + :return: fsID for the output bam + :rtype: toil.fileStore.FileID """ job.fileStore.logToMaster('Running add_read_groups on %s:%s' % (univ_options['patient'], sample_type)) @@ -231,7 +223,7 @@ def add_readgroups(job, bamfile, sample_type, univ_options, picard_options): 'PU=12345', ''.join(['SM=', sample_type.rstrip('_dna')])] docker_call(tool='picard', tool_parameters=parameters, work_dir=work_dir, - dockerhub=univ_options['dockerhub'], java_opts=univ_options['java_Xmx'], + dockerhub=univ_options['dockerhub'], java_xmx=univ_options['java_Xmx'], tool_version=picard_options['version']) output_file = job.fileStore.writeGlobalFile( '/'.join([work_dir, sample_type + '_aligned_fixpg_sorted_reheader.bam'])) diff --git a/src/protect/alignment/rna.py b/src/protect/alignment/rna.py index eeab3880..fed2cc83 100644 --- a/src/protect/alignment/rna.py +++ b/src/protect/alignment/rna.py @@ -15,8 +15,14 @@ from __future__ import absolute_import, print_function from collections import defaultdict from math import ceil + from protect.alignment.common import index_bamfile, index_disk -from protect.common import docker_call, get_files_from_filestore, is_gzipfile, untargz, docker_path +from protect.common import (docker_call, + docker_path, + export_results, + get_files_from_filestore, + is_gzipfile, + untargz) from toil.job import PromisedRequirement import os @@ -31,7 +37,13 @@ def star_disk(rna_fastqs, star_tar): def align_rna(job, fastqs, univ_options, star_options): """ - This is a convenience function that runs the entire rna alignment subgraph + A wrapper for the entire rna alignment subgraph. + + :param list fastqs: The input fastqs for alignment + :param dict univ_options: Dict of universal options used by almost all tools + :param dict star_options: Options specific to star + :return: Dict containing input bam and the generated index (.bam.bai) + :rtype: dict """ star = job.wrapJobFn(run_star, fastqs, univ_options, star_options, cores=star_options['n'], @@ -47,26 +59,17 @@ def align_rna(job, fastqs, univ_options, star_options): def run_star(job, fastqs, univ_options, star_options): """ - This module uses STAR to align the RNA fastqs to the reference - - ARGUMENTS - 1. fastqs: REFER RETURN VALUE of run_cutadapt() - 2. univ_options: Dict of universal arguments used by almost all tools - univ_options - +- 'dockerhub': - 3. star_options: Dict of parameters specific to STAR - star_options - |- 'index': - +- 'n': - RETURN VALUES - 1. output_files: Dict of aligned bams - output_files - |- 'rnaAligned.toTranscriptome.out.bam': - +- 'rnaAligned.sortedByCoord.out.bam': Dict of genome bam + bai - |- 'rna_fix_pg_sorted.bam': - +- 'rna_fix_pg_sorted.bam.bai': + Align a pair of fastqs with STAR. 
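The nested return value documented below is easiest to read as a literal; the placeholder strings here stand in for fsIDs:

    star_output = {
        'rnaAligned.toTranscriptome.out.bam': 'fsID-transcriptome-bam',
        # After index_star runs, the genome-coordinate entry is itself a dict:
        'rnaAligned.sortedByCoord.out.bam': {
            'rna_fix_pg_sorted.bam': 'fsID-genome-bam',
            'rna_fix_pg_sorted.bam.bai': 'fsID-genome-bai',
        },
    }
    # rsem consumes only the transcriptome-space bam:
    transcriptome_bam = star_output['rnaAligned.toTranscriptome.out.bam']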
- This module corresponds to node 9 on the tree + :param list fastqs: The input fastqs for alignment + :param dict univ_options: Dict of universal options used by almost all tools + :param dict star_options: Options specific to star + :return: Dict containing output genome bam, genome bai, and transcriptome bam + output_files: + |- 'rnaAligned.toTranscriptome.out.bam': fsID + +- 'rnaAligned.sortedByCoord.out.bam': + +- 'rna_fix_pg_sorted.bam': fsID + :rtype: dict """ assert star_options['type'] in ('star', 'starlong') job.fileStore.logToMaster('Running STAR on %s' % univ_options['patient']) @@ -110,13 +113,27 @@ def run_star(job, fastqs, univ_options, star_options): 'rnaAligned.sortedByCoord.out.bam']: output_files[bam_file] = job.fileStore.writeGlobalFile('/'.join([ work_dir, bam_file])) + export_results(job, output_files['rnaAligned.toTranscriptome.out.bam'], 'rna_transcriptome.bam', + univ_options, subfolder='alignments') return output_files def index_star(job, star_bams, univ_options, star_options): """ - This is a wrapper functiion for index_bamfile in protect.common which is required since run_star + A wrapper for indexing the genomic star bam generated by run_star. It is required since run_star returns a dict of 2 bams + + :param dict star_bams: The bams from run_star + :param dict univ_options: Dict of universal options used by almost all tools + :param dict star_options: Options specific to star + :return: Dict containing input bam and the generated index (.bam.bai) + output_files: + |- 'rnaAligned.toTranscriptome.out.bam': fsID + +- 'rnaAligned.sortedByCoord.out.bam': + |- 'rna_fix_pg_sorted.bam': fsID + +- 'rna_fix_pg_sorted.bam.bai': fsID + + :rtype: dict """ index = job.wrapJobFn(index_bamfile, star_bams['rnaAligned.sortedByCoord.out.bam'], 'rna', univ_options, samtools_options=star_options['samtools'], diff --git a/src/protect/binding_prediction/__init__.py b/src/protect/binding_prediction/__init__.py index c592ef89..6656c66f 100644 --- a/src/protect/binding_prediction/__init__.py +++ b/src/protect/binding_prediction/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import \ No newline at end of file +from __future__ import absolute_import diff --git a/src/protect/binding_prediction/common.py b/src/protect/binding_prediction/common.py index 308d4cdd..1002d7f2 100644 --- a/src/protect/binding_prediction/common.py +++ b/src/protect/binding_prediction/common.py @@ -13,62 +13,49 @@ # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import, print_function - -import json -import os -import re from collections import defaultdict -import pandas from protect.binding_prediction.mhci import predict_mhci_binding from protect.binding_prediction.mhcii import predict_mhcii_binding, predict_netmhcii_binding -from protect.common import get_files_from_filestore, untargz, export_results, read_peptide_file +from protect.common import get_files_from_filestore, export_results, read_peptide_file, untargz + +import json +import os +import pandas +import re def spawn_antigen_predictors(job, transgened_files, phlat_files, univ_options, mhc_options): """ - Based on the number of alleles obtained from node 14, this module will spawn callers to predict - MHCI:peptide and MHCII:peptide binding on the peptide files from node 17. 
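A schematic of that per-allele fan-out (a sketch, not the pipeline's exact code; the peptfiles dict keyed by peptide length is a hypothetical stand-in for the transgened files):

    def spawn_mhci_predictions(job, mhci_alleles, peptfiles, univ_options,
                               mhci_options):
        # One child job per allele x peptide-length; the .rv() promises are
        # gathered into a dict keyed the way the merge step expects.
        mhci_preds = {}
        for allele in mhci_alleles:
            for peplen in ('9', '10'):
                key = '_'.join([allele, peplen, 'mer.faa'])
                mhci_preds[key] = job.addChildJobFn(
                    predict_mhci_binding, peptfiles[peplen], allele, peplen,
                    univ_options, mhci_options).rv()
        return mhci_preds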
Once all MHC:peptide - predictions are made, merge them via a follow-on job. - - ARGUMENTS - 1. transgened_files: REFER RETURN VALUE of run_transgene() - 2. phlat_files: REFER RETURN VALUE of merge_phlat_calls() - 3. univ_options: Dict of universal arguments used by almost all tools - univ_options - +- 'dockerhub': - 4. mhc_options: Dict of dicts of parameters specific to mhci and mhcii - respectively - mhc_options - |- 'mhci' - | |- 'method_file': - | +- 'pred': String describing prediction method to use - +- 'mhcii' - |- 'method_file': - +- 'pred': String describing prediction method to use - - RETURN VALUES - 1. tuple of (mhci_preds, mhcii_preds) - mhci_preds: Dict of return value from running predictions on a given - mhc for all peptides of length 9 and 10. - mhci_preds - |- _9_mer.faa: - |- _10_mer.faa: - | - .. - +- _10_mer.faa: - mhcii_preds: Dict of return value from running predictions on a given - mhc for all peptides of length 15. - mhci_preds - |- _15_mer.faa: - | - .. - +- _15_mer.faa: - - This module corresponds to node 18 on the tree + Spawn a job to predict MHCI:peptide and MHCII:peptide binding on the input peptide file for each + allele in in the input haplotype (phlat) files. + + :param dict transgened_files: Dict of tumor and normal peptide fsIDs and the tumor .map fsIDs + :param dict phlat_files: Dict of MHCI and MHCII haplotypes + :param dict univ_options: Dict of universal options used by almost all tools + :param tuple mhc_options: Options specific to mhci and mhcii binding predictions + :return: Tuple of dicts of mhci and mhcii predictions + (mhci_preds, mhcii_preds) + | |- : + | | |- 'tumor': fsID + | | +- 'normal': (fsID, str) + | | + | |- ... + | | + | +- : + | |- 'tumor': fsID + | +- 'normal': (fsID, str) + | + |- : + | |- 'tumor': fsID + | +- 'normal': fsID + | + |- ... + | + +- : + |- 'tumor': fsID + +- 'normal': fsID + :rtype: tuple(dict, dict) """ job.fileStore.logToMaster('Running spawn_anti on %s' % univ_options['patient']) work_dir = os.getcwd() @@ -151,9 +138,9 @@ def spawn_antigen_predictors(job, transgened_files, phlat_files, univ_options, m def read_fastas(input_files): """ - Reads the tumor and normal fastas to disk + Read the tumor and normal fastas into a joint dict. - :param dict input_files: A dict containing filename:filepath for T_ and N_ transgened files. + :param dict input_files: A dict containing filename: filepath for T_ and N_ transgened files. :return: The read fastas in a dictionary of tuples :rtype: dict """ @@ -169,7 +156,7 @@ def read_fastas(input_files): def _read_fasta(fasta_file, output_dict): """ - Actually read the peptide fasta. + Read the peptide fasta into an existing dict. :param str fasta_file: The peptide file :param dict output_dict: The dict to appends results to. @@ -192,8 +179,8 @@ def _read_fasta(fasta_file, output_dict): def _process_consensus_mhcii(mhc_file, normal=False): """ - Process the results from running IEDB binding predictions using the consensus method into a - pandas dataframe. + Process the results from running IEDB MHCII binding predictions using the consensus method into + a pandas dataframe. :param str mhc_file: Output file containing consensus mhcii:peptide binding predictions :param bool normal: Is this processing the results of a normal? @@ -235,8 +222,8 @@ def _process_consensus_mhcii(mhc_file, normal=False): def _process_sturniolo_mhcii(mhc_file, normal=False): """ - Process the results from running IEDB binding predictions using the sturniolo method into a - pandas dataframe. 
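These _process_* helpers all funnel a tab-separated predictor output into pandas. A generic, hedged sketch; the column names here are illustrative, not the tools' documented schemas:

    import pandas

    def predictions_to_frame(pred_file, normal=False):
        # Each real _process_* helper maps its tool's actual columns onto the
        # pept/pred vocabulary used downstream; 'normal_' prefixes mark the
        # normal-sample counterpart columns.
        df = pandas.read_table(pred_file, sep='\t')
        prefix = 'normal_' if normal else ''
        return df.rename(columns={'peptide': prefix + 'pept',
                                  'percentile_rank': prefix + 'pred'})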
+ Process the results from running IEDB MHCII binding predictions using the sturniolo method into + a pandas dataframe. :param str mhc_file: Output file containing sturniolo mhcii:peptide binding predictions :param bool normal: Is this processing the results of a normal? @@ -292,7 +279,7 @@ def _process_net_mhcii(mhc_file, normal=False): def _process_mhci(mhc_file, normal=False): """ - Process the results from running IEDB mhci binding predictions into a pandas dataframe + Process the results from running IEDB MHCI binding predictions into a pandas dataframe. :param str mhc_file: Output file containing netmhciipan mhci:peptide binding predictions :param bool normal: Is this processing the results of a normal? @@ -354,28 +341,18 @@ def predict_normal_binding(job, binding_result, transgened_files, allele, peplen Predict the binding score for the normal counterparts of the peptides in mhc_dict and then return the results in a properly formatted structure. - :param job: Job :param str binding_result: The results from running predict_mhci_binding or predict_mhcii_binding on a single allele :param dict transgened_files: A dictionary containing the jobstore IDs for "T__mer.faa" and "N__mer.faa" :param str allele: The allele to get binding for :param str peplen: The peptide length - :param dict univ_options: Universal options for ProTECT - univ_options - +- 'dockerhub': - :param dict mhc_options: MHC specific options - mhc_options - |- 'mhci' - | |- 'method_file': - | +- 'pred': String describing prediction method to use - +- 'mhcii' - |- 'method_file': - +- 'pred': String describing prediction method to use + :param dict univ_options: Dict of universal options used by almost all tools + :param dict mhc_options: Options specific to mhci or mhcii binding predictions :return: A fully filled out mhc_dict with normal information + output_dict: + |- 'tumor': fsID + +- 'normal': fsID or (fsID, str) -- Depending on MHCI or MHCII :rtype: dict """ job.fileStore.logToMaster('Running predict_normal_binding on %s for allele %s and length %s' % @@ -454,15 +431,15 @@ def predict_normal_binding(job, binding_result, transgened_files, allele, peplen def merge_mhc_peptide_calls(job, antigen_predictions, transgened_files, univ_options): """ - Merge all the calls from nodes 18 and 19. - - This module corresponds to node 19 on the tree + Merge all the calls generated by spawn_antigen_predictors. - :param job: Job :param dict antigen_predictions: The return value from running :meth:`spawn_antigen_predictors` :param dict transgened_files: The transgened peptide files :param dict univ_options: Universal options for ProTECT :return: merged binding predictions + output_files: + |- 'mhcii_merged_files.list': fsID + +- 'mhci_merged_files.list': fsID :rtype: dict """ job.fileStore.logToMaster('Merging MHC calls') @@ -542,24 +519,16 @@ def merge_mhc_peptide_calls(job, antigen_predictions, transgened_files, univ_opt def print_mhc_peptide(neoepitope_info, peptides, pepmap, outfile, netmhc=False): """ - To reduce code redundancy, this module will accept data from merge_mhc_peptide_calls for a given - neoepitope and print it to outfile - ARGUMENTS - 1. neoepitope_info: pandas.core.frame with allele, pept, pred, core, normal_pept, normal_pred - 2. peptides: Dict of all IARS considered - peptides - |- 'neoepitope_1': - .. - |- 'neoepitope_n': - 3. pepmap: Info correlating neoepitope with the gene and transcript level - mutations. - peptides - |- 'neoepitope_1': - | 'ensembl_gene\thugo_gene\tcomma_sep_transcript_mutations' - .. 
-                 +- 'neoepitope_n':
-                        'ensembl_gene\thugo_gene\tcomma_sep_transcript_mutations'
-
+    Accept data about one neoepitope from merge_mhc_peptide_calls and print it to outfile. This is
+    a generic module to reduce code redundancy.
+
+    :param pandas.core.frame neoepitope_info: object containing allele, pept, pred, core,
+           normal_pept, normal_pred
+    :param dict peptides: Dict of pepname: pep sequence for all IARS considered
+    :param dict pepmap: Dict containing the contents from the peptide map file.
+    :param file outfile: An open file descriptor to the output file
+    :param bool netmhc: Does this record correspond to a netmhcIIpan record? These are processed
+           differently.
     """
     if netmhc:
         peptide_names = [neoepitope_info.peptide_name]
diff --git a/src/protect/binding_prediction/mhci.py b/src/protect/binding_prediction/mhci.py
index 148c9797..7d1c84c8 100644
--- a/src/protect/binding_prediction/mhci.py
+++ b/src/protect/binding_prediction/mhci.py
@@ -14,18 +14,23 @@
 # limitations under the License.
 from __future__ import absolute_import, print_function
 
-import os
-
 from protect.common import docker_call, get_files_from_filestore, read_peptide_file
 
+import os
+
 
-def predict_mhci_binding(job, peptfile, allele, peplen, univ_options,
-                         mhci_options):
+def predict_mhci_binding(job, peptfile, allele, peplen, univ_options, mhci_options):
     """
-    This module will predict MHC:peptide binding for peptides in the files created in node XX to
-    ALLELE. ALLELE represents an MHCI allele.
+    Predict binding for each peptide in `peptfile` to `allele` using the IEDB mhci binding
+    prediction tool.
 
-    This module corresponds to node 18 on the tree
+    :param toil.fileStore.FileID peptfile: The input peptide fasta
+    :param str allele: Allele to predict binding against
+    :param str peplen: Length of peptides to process
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict mhci_options: Options specific to mhci binding prediction
+    :return: fsID for file containing the predictions
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running mhci on %s:%s:%s' % (univ_options['patient'], allele,
                                                             peplen))
diff --git a/src/protect/binding_prediction/mhcii.py b/src/protect/binding_prediction/mhcii.py
index 874095ba..07498fdf 100644
--- a/src/protect/binding_prediction/mhcii.py
+++ b/src/protect/binding_prediction/mhcii.py
@@ -22,13 +22,15 @@
 
 def predict_mhcii_binding(job, peptfile, allele, univ_options, mhcii_options):
     """
-    This module will predict MHC:peptide binding for peptides in the files created in node YY to
-    ALLELE. ALLELE represents an MHCII allele.
+    Predict binding for each peptide in `peptfile` to `allele` using the IEDB mhcii binding
+    prediction tool.
 
-    The module returns (PREDFILE, PREDICTOR) where PREDFILE contains the predictions and PREDICTOR
-    is the predictor used (Consensus, NetMHCIIpan, or Sturniolo).
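That (file, predictor) pairing survives into the new interface; downstream code unpacks it, for example (placeholder values standing in for a real result):

    pred_file, predictor = ('fsID-of-predictions', 'Consensus')
    if predictor == 'Sturniolo':
        # Sturniolo output needs its own parser (_process_sturniolo_mhcii above).
        pass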
-
-    This module corresponds to node 19 on the tree
+    :param toil.fileStore.FileID peptfile: The input peptide fasta
+    :param str allele: Allele to predict binding against
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict mhcii_options: Options specific to mhcii binding prediction
+    :return: tuple of fsID for file containing the predictions and the predictor used
+    :rtype: tuple(toil.fileStore.FileID, str)
     """
     job.fileStore.logToMaster('Running mhcii on %s:%s' % (univ_options['patient'], allele))
     work_dir = os.getcwd()
@@ -72,10 +74,14 @@ def predict_mhcii_binding(job, peptfile, allele, univ_options, mhcii_options):
 
 def predict_netmhcii_binding(job, peptfile, allele, univ_options, netmhciipan_options):
     """
-    This module will predict MHC:peptide binding for peptides in the files created in node YY to
-    ALLELE. ALLELE represents an MHCII allele.
+    Predict binding for each peptide in `peptfile` to `allele` using netMHCIIpan.
 
-    This module corresponds to node 19 on the tree
+    :param toil.fileStore.FileID peptfile: The input peptide fasta
+    :param str allele: Allele to predict binding against
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict netmhciipan_options: Options specific to netmhciipan binding prediction
+    :return: tuple of fsID for file containing the predictions and the predictor used (netMHCIIpan)
+    :rtype: tuple(toil.fileStore.FileID, str)
     """
     job.fileStore.logToMaster('Running netmhciipan on %s' % allele)
     work_dir = os.getcwd()
diff --git a/src/protect/common.py b/src/protect/common.py
index 3a303978..a2548501 100644
--- a/src/protect/common.py
+++ b/src/protect/common.py
@@ -28,29 +28,23 @@
 import errno
 import gzip
 import os
-import re
-import shutil
 import subprocess
 import sys
 import tarfile
-import time
 import urllib2
 import uuid
 
 
 def get_files_from_filestore(job, files, work_dir, docker=False):
     """
-    This is adapted from John Vivian's return_input_paths from the RNA-Seq pipeline.
+    Download a dict of files to the given directory and modify the path to a docker-friendly one if
+    requested.
 
-    Returns the paths of files from the FileStore if they are not present.
-    If docker=True, return the docker path for the file.
-    If the file extension is tar.gz, then tar -zxvf it.
-
-    files is a dict with:
-        keys = the name of the file to be returned in toil space
-        value = the input value for the file (can be toil temp file)
-    work_dir is the location where the file should be stored
-    cache indiciates whether caching should be used
+    :param dict files: A dictionary of filenames: fsIDs
+    :param str work_dir: The destination directory
+    :param bool docker: Should the file path be converted to our standard docker '/data/filename'?
+    :return: Dict of files: (optionally docker-friendly) filepaths
+    :rtype: dict
     """
     for name in files.keys():
         outfile = job.fileStore.readGlobalFile(files[name], '/'.join([work_dir, name]))
@@ -65,17 +59,29 @@ def get_files_from_filestore(job, files, work_dir, docker=False):
 
 def docker_path(filepath):
     """
-    Given a path, returns that files path inside the docker mount directory
-    (/data).
+    Given a path, return that file's path inside the docker mount directory (/data).
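The mapping is a one-liner, so a worked example is enough to pin down the behaviour:

    import os

    def docker_path(filepath):
        return os.path.join('/data', os.path.basename(filepath))

    # Any host file under the mounted work_dir maps to a flat /data path:
    assert docker_path('/home/user/work/sample.bam') == '/data/sample.bam'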
+ + :param str filepath: The path to a file + :return: The docker-friendly path for `filepath` + :rtype: str """ return os.path.join('/data', os.path.basename(filepath)) -def docker_call(tool, tool_parameters, work_dir, java_opts=None, outfile=None, +def docker_call(tool, tool_parameters, work_dir, java_xmx=None, outfile=None, dockerhub='aarjunrao', interactive=False, tool_version='latest'): """ - Makes subprocess call of a command to a docker container. work_dir MUST BE AN ABSOLUTE PATH or - the call will fail. outfile is an open file descriptor to a writeable file. + Make a subprocess call of a command to a docker container. + + :param str tool: The tool to run + :param list tool_parameters: Parameters passed to `tool` + :param str work_dir: The absolute path to the working directory to be mounted into the container + :param str java_xmx: The heap space in human readable format to provide java ('20G' will pass + -Xmx20G to java) + :param file outfile: The file object to dump stdout + :param str dockerhub: The dockerhub from where the tool will be pulled + :param bool interactive: Should the docker container be run in interactive mode? + :param str tool_version: What dockerised tool version should be used? """ # If an outifle has been provided, then ensure that it is of type file, it is writeable, and # that it is open. @@ -106,8 +112,8 @@ def docker_call(tool, tool_parameters, work_dir, java_opts=None, outfile=None, raise RuntimeError('docker not found on system. Install on all' + ' nodes.') # If java options have been provided, it needs to be in the docker call - if java_opts: - base_docker_call = ' docker run -e JAVA_OPTS=-Xmx{} '.format(java_opts) + '--rm=true ' + \ + if java_xmx: + base_docker_call = ' docker run -e JAVA_OPTS=-Xmx{} '.format(java_xmx) + '--rm=true ' + \ '-v {}:/data --log-driver=none '.format(work_dir) + interactive else: base_docker_call = ' docker run --rm=true -v {}:/data '.format(work_dir) + \ @@ -124,12 +130,14 @@ def docker_call(tool, tool_parameters, work_dir, java_opts=None, outfile=None, def untargz(input_targz_file, untar_to_dir): """ - This module accepts a tar.gz archive and untars it. + Accept a tar.gz archive and untar it to the given location. The archive can have either one + file, or many files in a single directory. - RETURN VALUE: path to the untar-ed directory/file + :param str input_targz_file: Path to a tar.gz archive + :param str untar_to_dir: The directory where untared files will be dumped - NOTE: this module expects the multiple files to be in a directory before - being tar-ed. + :return: path to the untar-ed directory/file + :rtype: str """ assert tarfile.is_tarfile(input_targz_file), 'Not a tar file.' tarball = tarfile.open(input_targz_file) @@ -142,8 +150,10 @@ def untargz(input_targz_file, untar_to_dir): def gunzip(input_gzip_file, block_size=1024): """ Gunzips the input file to the same directory + :param input_gzip_file: File to be gunzipped :return: path to the gunzipped file + :rtype: str """ assert os.path.splitext(input_gzip_file)[1] == '.gz' assert is_gzipfile(input_gzip_file) @@ -160,10 +170,15 @@ def gunzip(input_gzip_file, block_size=1024): def is_gzipfile(filename): """ - This function attempts to ascertain the gzip status of a file based on the "magic signatures" of - the file. This was taken from the stack overflow + Attempt to ascertain the gzip status of a file based on the "magic signatures" of the file. 
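The check itself is tiny; a self-contained sketch of the magic-byte test described here:

    def looks_gzipped(filename):
        # Gzip streams always begin with the two magic bytes 0x1f 0x8b.
        with open(filename, 'rb') as handle:
            return handle.read(2) == b'\x1f\x8b'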
+
+    This was taken from the stack overflow
     post http://stackoverflow.com/questions/13044562/python-mechanism-to-identify-compressed-file-type\
     -and-uncompress
+
+    :param str filename: A path to a file
+    :return: True if the file appears to be gzipped else false
+    :rtype: bool
     """
     assert os.path.exists(filename), 'Input {} does not '.format(filename) + \
         'point to a file.'
@@ -178,21 +193,24 @@ def is_gzipfile(filename):
 
 def get_file_from_s3(job, s3_url, encryption_key=None, per_file_encryption=True,
                      write_to_jobstore=True):
     """
-    Downloads a supplied URL that points to an unencrypted, unprotected file on Amazon S3. The file
-    is downloaded and a subsequently written to the jobstore and the return value is a the path to
-    the file in the jobstore.
+    Download a supplied URL that points to a file on Amazon S3. If the file is encrypted using
+    sse-c (with the user-provided key or with a hash of the user-provided master key) then the
+    encryption keys will be used when downloading. The file is downloaded and written to the
+    jobstore if requested.
 
-    :param str s3_url: URL for the file (can be s3 or https)
+    :param str s3_url: URL for the file (can be s3, S3 or https)
     :param str encryption_key: Path to the master key
     :param bool per_file_encryption: If encrypted, was the file encrypted using the per-file
            method?
     :param bool write_to_jobstore: Should the file be written to the job store?
+    :return: Path to the downloaded file or fsID (if write_to_jobstore was True)
+    :rtype: str|toil.fileStore.FileID
     """
     work_dir = job.fileStore.getLocalTempDir()
 
     parsed_url = urlparse(s3_url)
     if parsed_url.scheme == 'https':
         download_url = 'S3:/' + parsed_url.path  # path contains the second /
-    elif parsed_url.scheme in ('s3','S3'):
+    elif parsed_url.scheme in ('s3', 'S3'):
         download_url = s3_url
     else:
         raise RuntimeError('Unexpected url scheme: %s' % s3_url)
@@ -261,14 +279,18 @@ def get_file_from_s3(job, s3_url, encryption_key=None, per_file_encryption=True,
 
 def get_file_from_url(job, any_url, encryption_key=None, per_file_encryption=True,
                       write_to_jobstore=True):
     """
-    Downloads a supplied URL (ftp,http,https) that points to a file. The file
-    is downloaded and a subsequently written to the jobstore and the return value is a the path to
-    the file in the jobstore. If URL link is invalid the function will raise an error.
+    Download a supplied URL that points to a file on an http, https or ftp server. If the file is
+    found to be an https s3 link then the file is downloaded using `get_file_from_s3`. The file is
+    downloaded and written to the jobstore if requested.
+
+    Encryption arguments are passed through to `get_file_from_s3` if required.
 
     :param str any_url: URL for the file
+    :param str encryption_key: Path to the master key
+    :param bool per_file_encryption: If encrypted, was the file encrypted using the per-file
+           method?
     :param bool write_to_jobstore: Should the file be written to the job store?
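The https-to-s3am rewrite used above can be isolated into a sketch (python 2 imports, matching this codebase):

    from urlparse import urlparse  # urllib.parse in python 3

    def s3am_url(s3_url):
        parsed = urlparse(s3_url)
        if parsed.scheme == 'https':
            # parsed.path keeps its leading '/', so 'S3:/' + path is well formed.
            return 'S3:/' + parsed.path
        elif parsed.scheme in ('s3', 'S3'):
            return s3_url
        raise RuntimeError('Unexpected url scheme: %s' % s3_url)

    # Path-style https URLs drop the endpoint host in the rewrite:
    assert s3am_url('https://s3-us-west-2.amazonaws.com/bucket/key') == 'S3://bucket/key'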
+    :return: Path to the downloaded file or fsID (if write_to_jobstore was True)
+    :rtype: str|toil.fileStore.FileID
     """
-
     work_dir = job.fileStore.getLocalTempDir()
 
     filename = '/'.join([work_dir, str(uuid.uuid4())])
     parsed_url = urlparse(any_url)
     try:
         response = urllib2.urlopen(url)
     except urllib2.HTTPError:
-        if (parsed_url.netloc).startswith(('s3', 'S3')):
+        if parsed_url.netloc.startswith(('s3', 'S3')):
             job.fileStore.logToMaster("Detected https link is for an encrypted s3 file.")
             return get_file_from_s3(job, any_url, encryption_key=encryption_key,
                                     per_file_encryption=per_file_encryption,
                                     write_to_jobstore=write_to_jobstore)
@@ -296,14 +317,13 @@ def get_file_from_url(job, any_url, encryption_key=None, per_file_encryption=Tru
 
 def bam2fastq(bamfile, univ_options, picard_options):
     """
-    split an input bam to paired fastqs.
-
-    ARGUMENTS
-    1. bamfile: Path to a bam file
-    2. univ_options: Dict of universal arguments used by almost all tools
-         univ_options
-                |- 'dockerhub':
-                +- 'java_Xmx': value for max heap passed to java
+    Split an input bam to paired fastqs.
+
+    :param str bamfile: Path to a bam file
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict picard_options: Dict of options specific to Picard
+    :return: Path to the _1.fastq file
+    :rtype: str
     """
     work_dir = os.path.split(bamfile)[0]
     base_name = os.path.split(os.path.splitext(bamfile)[0])[1]
@@ -313,31 +333,27 @@ def bam2fastq(bamfile, univ_options, picard_options):
                   ''.join(['F2=/data/', base_name, '_2.fastq']),
                   ''.join(['FU=/data/', base_name, '_UP.fastq'])]
     docker_call(tool='picard', tool_parameters=parameters, work_dir=work_dir,
-                dockerhub=univ_options['dockerhub'], java_opts=univ_options['java_Xmx'],
+                dockerhub=univ_options['dockerhub'], java_xmx=univ_options['java_Xmx'],
                 tool_version=picard_options['version'])
     first_fastq = ''.join([work_dir, '/', base_name, '_1.fastq'])
     assert os.path.exists(first_fastq)
     return first_fastq
 
 
-def export_results(job, fsid, file_path, univ_options, subfolder=None):
+def export_results(job, fsid, file_name, univ_options, subfolder=None):
     """
     Write out a file to a given location. The location can be either a directory on the local
     machine, or a folder with a bucket on AWS.
-    TODO: Azure support
+
     :param str fsid: The file store id for the file to be exported
-    :param str file_path: The path to the file that neeeds to be exported
-    :param dict univ_options: A dict of the universal options passed to this script. The important
-           dict entries are ouput_folder and storage_location.
-                * storage_location: 'Local' or an 'aws:'.
-                * output_folder: The folder to store the file. This must exist on the local machine
-                                 if storage_location is 'Local'. If the storage_location is an aws
-                                 bucket, this string represents the path to the file in the bucket.
-                                 To keep it in the base directory, specify 'NA'.
+    :param str file_name: The name of the file that needs to be exported (path to file is also
+           acceptable)
+    :param dict univ_options: Dict of universal options used by almost all tools
     :param str subfolder: A sub folder within the main folder where this data should go
     :return: None
     """
     job.fileStore.logToMaster('Exporting %s to output location' % fsid)
+    file_name = os.path.basename(file_name)
     try:
         assert univ_options['output_folder'], 'Need a path to a folder to write out files'
         assert univ_options['storage_location'], 'Need to know where the files need to go. 
' + \ @@ -361,75 +377,24 @@ def export_results(job, fsid, file_path, univ_options, subfolder=None): except OSError as err: if err.errno != errno.EEXIST: raise - output_url = 'file://' + os.path.join(output_folder, os.path.basename(file_path)) + output_url = 'file://' + os.path.join(output_folder, file_name) elif univ_options['storage_location'].startswith('aws'): # Handle AWS bucket_name = univ_options['storage_location'].split(':')[-1] - - file_name = os.path.basename(file_path) output_url = os.path.join('S3://', bucket_name, output_folder.strip('/'), file_name) # Can't do Azure or google yet. else: + # TODO: Azure support print("Currently doesn't support anything but Local and aws.") return job.fileStore.exportFile(fsid, output_url) -def write_to_s3(file_path, key_path, bucket_name, output_folder, overwrite=True): - """ - Write the file to S3. - - :param file_path: The file to be written - :param key_path: Path to the encryption Key - :param bucket_name: The bucket where the data will be written - :param output_folder: The location in the bucket for the output data - :param overwrite: Should the data be overwritten if it exists in the bucket? - """ - assert overwrite in (True, False) - overwrite = 'overwrite' if overwrite else 'skip' - file_name = os.path.basename(file_path) - output_file = os.path.join('S3://', bucket_name, output_folder.strip('/'), file_name) - # Adding resume doesn't affect a new upload, but protects in failed upload cases. - subprocess.check_call(['s3am', 'upload', '--exists=' + overwrite, '--resume', - '--sse-key-file', key_path, file_path, output_file]) - - -def file_xext(filepath): - """ - Get the file extension wrt compression from the filename (is it tar or targz) - :param str filepath: Path to the file - :return str ext: Compression extension name - """ - ext = os.path.splitext(filepath)[1] - if ext == '.gz': - xext = os.path.splitext(os.path.splitext(filepath)[0])[1] - if xext == '.tar': - ext = xext + ext - elif ext == '.tar': - pass # ext is already .tar - else: - ext = '' - return ext - - -def strip_xext(filepath): - """ - Strips the compression extension from the filename - :param filepath: Path to compressed file. - :return str filepath: Path to the file with the compression extension stripped off. - """ - ext_size = len(file_xext(filepath).split('.')) - 1 - for i in xrange(0, ext_size): - filepath = os.path.splitext(filepath)[0] - return filepath - - def delete_fastqs(job, fastqs): """ - This module will delete the fastqs from the job Store once their purpose has been achieved (i.e. - after all mapping steps) + This module will delete the fastqs from the job Store once their purpose has been achieved + (i.e. 
after all mapping steps) - ARGUMENTS :param dict fastqs: Dict of list of input fastqs """ for key in fastqs.keys(): @@ -451,12 +416,14 @@ class ParameterError(Exception): def read_peptide_file(in_peptfile): """ - This module reads an input peptide fasta file into memory in the form of a dict with - key = fasta record name - value = corresponding peptide sequence + Reads an input peptide fasta file into memory in the form of a dict of fasta record: sequence + + :param str in_peptfile: Path to a peptide fasta + :return: Dict of fasta record: sequence + :rtype: dict """ peptides = defaultdict() - pept=None + pept = None with open(in_peptfile, 'r') as peptfile: for line in peptfile: if line.startswith('>'): diff --git a/src/protect/expression_profiling/__init__.py b/src/protect/expression_profiling/__init__.py index c592ef89..6656c66f 100644 --- a/src/protect/expression_profiling/__init__.py +++ b/src/protect/expression_profiling/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import \ No newline at end of file +from __future__ import absolute_import diff --git a/src/protect/expression_profiling/rsem.py b/src/protect/expression_profiling/rsem.py index b2ce9279..7e3b6511 100644 --- a/src/protect/expression_profiling/rsem.py +++ b/src/protect/expression_profiling/rsem.py @@ -14,8 +14,12 @@ # limitations under the License. from __future__ import print_function from math import ceil -from protect.common import (docker_call, get_files_from_filestore, untargz, docker_path, - export_results) + +from protect.common import (docker_call, + docker_path, + export_results, + get_files_from_filestore, + untargz) from toil.job import PromisedRequirement import os @@ -30,13 +34,16 @@ def rsem_disk(star_bams, rsem_index): def wrap_rsem(job, star_bams, univ_options, rsem_options): """ - This is a convenience function that runs rsem from the star outputs. + A wrapper for run_rsem using the results from run_star as input. - :param job job: job :param dict star_bams: dict of results from star - :param dict univ_options: Universal Options + :param dict univ_options: Dict of universal options used by almost all tools :param dict rsem_options: Options specific to rsem - :return: + :return: Dict of gene- and isoform-level expression calls + output_files: + |- 'rsem.genes.results': fsID + +- 'rsem.isoforms.results': fsID + :rtype: dict """ rsem = job.addChildJobFn(run_rsem, star_bams['rnaAligned.toTranscriptome.out.bam'], univ_options, rsem_options, cores=rsem_options['n'], @@ -48,22 +55,17 @@ def wrap_rsem(job, star_bams, univ_options, rsem_options): def run_rsem(job, rna_bam, univ_options, rsem_options): """ - This module will run rsem on the RNA Bam file. + Run rsem on the input RNA bam. ARGUMENTS - 1. rna_bam: - 2. univ_options: Dict of universal arguments used by almost all tools - univ_options - +- 'dockerhub': - 3. rsem_options: Dict of parameters specific to rsem - rsem_options - |- 'index': - +- 'n': - - RETURN VALUES - 1. 
output_file:
-
-    This module corresponds to node 9 on the tree
+    :param toil.fileStore.FileID rna_bam: fsID of a transcriptome bam generated by STAR
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict rsem_options: Options specific to rsem
+    :return: Dict of gene- and isoform-level expression calls
+             output_files:
+                 |- 'rsem.genes.results': fsID
+                 +- 'rsem.isoforms.results': fsID
+    :rtype: dict
     """
     job.fileStore.logToMaster('Running rsem on %s' % univ_options['patient'])
     work_dir = os.getcwd()
diff --git a/src/protect/haplotyping/__init__.py b/src/protect/haplotyping/__init__.py
index c592ef89..6656c66f 100644
--- a/src/protect/haplotyping/__init__.py
+++ b/src/protect/haplotyping/__init__.py
@@ -12,4 +12,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
\ No newline at end of file
+from __future__ import absolute_import
diff --git a/src/protect/haplotyping/phlat.py b/src/protect/haplotyping/phlat.py
index 151361ee..1979ceaf 100644
--- a/src/protect/haplotyping/phlat.py
+++ b/src/protect/haplotyping/phlat.py
@@ -15,12 +15,16 @@
 from __future__ import absolute_import, print_function
 from collections import defaultdict
 from math import ceil
-from protect.common import (docker_call, get_files_from_filestore, is_gzipfile, untargz,
-                            docker_path, export_results)
+
+from protect.common import (docker_call,
+                            docker_path,
+                            export_results,
+                            get_files_from_filestore,
+                            is_gzipfile,
+                            untargz)
 
 import os
 import re
-import sys
 
 
 # disk for phlat
@@ -31,26 +35,14 @@
 def run_phlat(job, fastqs, sample_type, univ_options, phlat_options):
     """
-    This module will run PHLAT on SAMPLE_TYPE fastqs.
-
-    ARGUMENTS -- depicts the sample type. Substitute with 'tumor_dna',
-                  'normal_dna', or 'tumor_rna'
-    1. fastqs: Dict of list of input WGS/WXS fastqs
-         fastqs
-              +- '': [ ,  ]
-    2. sample_type: string of 'tumor' or 'normal'
-    3. univ_options: Dict of universal arguments used by almost all tools
-         univ_options
-              +- 'dockerhub':
-    4. phlat_options: Dict of parameters specific to phlat
-         phlat_options
-              |- 'index':
-              +- 'n':
-
-    RETURN VALUES
-    1. output_file:
-
-    This module corresponds to nodes 5, 6 and 7 on the tree
+    Run PHLAT on a pair of input fastqs of type `sample_type`.
+
+    :param list fastqs: List of input fastq files
+    :param str sample_type: Description of the sample type to inject into the file name.
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict phlat_options: Options specific to PHLAT
+    :return: fsID for the HLA haplotype called from the input fastqs
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running phlat on %s:%s' % (univ_options['patient'], sample_type))
     work_dir = os.getcwd()
@@ -85,21 +77,17 @@ def run_phlat(job, fastqs, sample_type, univ_options, phlat_options):
 
 def merge_phlat_calls(job, tumor_phlat, normal_phlat, rna_phlat, univ_options):
     """
-    This module will merge the results form running PHLAT on the 3 input fastq
-    pairs.
-
-    ARGUMENTS
-    1. tumor_phlat:
-    2. normal_phlat:
-    3. rna_phlat:
+    Merge tumor, normal and tumor rna haplotypes into consensus calls.
 
-    RETURN VALUES
-    1. 
output_files: Dict of JSids for consensus MHCI and MHCII alleles + :param toil.fileStore.FileID tumor_phlat: fsID for HLA haplotypes called from tumor DNA + :param toil.fileStore.FileID normal_phlat: fsID for HLA haplotypes called from normal DNA + :param toil.fileStore.FileID rna_phlat: fsID for HLA haplotypes called from tumor RNA + :param dict univ_options: Dict of universal options used by almost all tools + :return: Dict of fsIDs for consensus MHCI and MHCII alleles output_files - |- 'mhci_alleles.list': - +- 'mhcii_alleles.list': - - This module corresponds to node 14 on the tree + |- 'mhci_alleles.list': fsID + +- 'mhcii_alleles.list': fsID + :rtype: dict """ job.fileStore.logToMaster('Merging Phlat calls') work_dir = os.getcwd() @@ -140,9 +128,12 @@ def merge_phlat_calls(job, tumor_phlat, normal_phlat, rna_phlat, univ_options): def parse_phlat_file(phlatfile, mhc_alleles): """ - Parse the input phlat file to pull out the alleles it contains - :param phlatfile: Open file descriptor for a phlat output sum file - :param mhc_alleles: dictionary of alleles. + Parse an input phlat file to identify predicted HLA alleles. + + :param file phlatfile: Open file descriptor for a phlat output sum file + :param dict mhc_alleles: Dict of alleles. + :return: Updated dict of alleles + :rtype: dict """ for line in phlatfile: if line.startswith('Locus'): @@ -168,8 +159,11 @@ def parse_phlat_file(phlatfile, mhc_alleles): def most_probable_alleles(allele_list): """ - This module accepts a list of tuples of (allele, p_value) pairs. It returns the 2 most probable - alleles for that group. + Identify the most probable haplotype in a list + + :param list allele_list: List of tuples of (allele, p_value) + :return: List of 2 most probable alleles in the group + :rtype: list[str] """ all_alleles = defaultdict() # First collect all the keys. Make a dict with allele as key and list of pvalues as value diff --git a/src/protect/mutation_annotation/__init__.py b/src/protect/mutation_annotation/__init__.py index c592ef89..6656c66f 100644 --- a/src/protect/mutation_annotation/__init__.py +++ b/src/protect/mutation_annotation/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import \ No newline at end of file +from __future__ import absolute_import diff --git a/src/protect/mutation_annotation/snpeff.py b/src/protect/mutation_annotation/snpeff.py index 151fdbd2..b00301ae 100644 --- a/src/protect/mutation_annotation/snpeff.py +++ b/src/protect/mutation_annotation/snpeff.py @@ -14,9 +14,12 @@ # limitations under the License. from __future__ import absolute_import, print_function from math import ceil -from protect.common import (docker_call, get_files_from_filestore, export_results, untargz, - docker_path) +from protect.common import (docker_call, + docker_path, + export_results, + get_files_from_filestore, + untargz) import os @@ -27,23 +30,13 @@ def snpeff_disk(snpeff_index): def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options): """ - This module will run snpeff on the aggregated mutation calls. Currently the only mutations - called are SNPs hence SnpEff suffices. This node will be replaced in the future with another - translator. + Run snpeff on an input vcf. - ARGUMENTS - 1. merged_mutation_file: - 2. univ_options: Dict of universal arguments used by almost all tools - univ_options - +- 'dockerhub': - 3. 
snpeff_options: Dict of parameters specific to snpeff - snpeff_options - +- 'index': - - RETURN VALUES - 1. output_file: - - This node corresponds to node 16 on the tree + :param toil.fileStore.FileID merged_mutation_file: fsID for input vcf + :param dict univ_options: Dict of universal options used by almost all tools + :param dict snpeff_options: Options specific to snpeff + :return: fsID for the snpeffed vcf + :rtype: toil.fileStore.FileID """ job.fileStore.logToMaster('Running snpeff on %s' % univ_options['patient']) work_dir = os.getcwd() @@ -56,8 +49,8 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options): parameters = ['eff', '-dataDir', input_files['snpeff_index'], - '-c', '/'.join([input_files['snpeff_index'], 'snpEff_' + univ_options['ref'] + - '_gencode.config']), + '-c', '/'.join([input_files['snpeff_index'], + 'snpEff_' + univ_options['ref'] + '_gencode.config']), '-no-intergenic', '-no-downstream', '-no-upstream', @@ -68,7 +61,7 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options): xmx = snpeff_options['java_Xmx'] if snpeff_options['java_Xmx'] else univ_options['java_Xmx'] with open('/'.join([work_dir, 'mutations.vcf']), 'w') as snpeff_file: docker_call(tool='snpeff', tool_parameters=parameters, work_dir=work_dir, - dockerhub=univ_options['dockerhub'], java_opts=xmx, outfile=snpeff_file, + dockerhub=univ_options['dockerhub'], java_xmx=xmx, outfile=snpeff_file, tool_version=snpeff_options['version']) output_file = job.fileStore.writeGlobalFile(snpeff_file.name) export_results(job, output_file, snpeff_file.name, univ_options, subfolder='mutations/snpeffed') diff --git a/src/protect/mutation_calling/__init__.py b/src/protect/mutation_calling/__init__.py index c592ef89..6656c66f 100644 --- a/src/protect/mutation_calling/__init__.py +++ b/src/protect/mutation_calling/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import \ No newline at end of file +from __future__ import absolute_import diff --git a/src/protect/mutation_calling/common.py b/src/protect/mutation_calling/common.py index 3b92098d..b1fedd38 100644 --- a/src/protect/mutation_calling/common.py +++ b/src/protect/mutation_calling/common.py @@ -14,6 +14,7 @@ # limitations under the License. from __future__ import absolute_import, print_function from collections import defaultdict + from protect.common import export_results, get_files_from_filestore, untargz import itertools @@ -22,11 +23,11 @@ def sample_chromosomes(job, genome_fai_file): """ - Get a list of chromosomes in the input data + Get a list of chromosomes in the input data. - :param job: job :param string genome_fai_file: Job store file ID for the genome fai file - :returns list: Chromosomes in the sample + :return: Chromosomes in the sample + :rtype: list[str] """ work_dir = os.getcwd() genome_fai = untargz(job.fileStore.readGlobalFile(genome_fai_file), work_dir) @@ -34,6 +35,13 @@ def sample_chromosomes(job, genome_fai_file): def chromosomes_from_fai(genome_fai): + """ + Read a fasta index (fai) file and parse the input chromosomes. + + :param str genome_fai: Path to the fai file. 
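For context, a .fai line is five tab-separated columns with the sequence name first, so the chromosome list is just the first field of each line (the numeric values here are illustrative):

    fai_line = 'chr1\t248956422\t112\t70\t71'
    # name, length, byte offset, bases per line, bytes per line
    assert fai_line.split('\t')[0] == 'chr1'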
+    :return: list of input chromosomes
+    :rtype: list[str]
+    """
     chromosomes = []
     with open(genome_fai) as fai_file:
         for line in fai_file:
@@ -44,20 +52,17 @@ def chromosomes_from_fai(genome_fai):
 
 def run_mutation_aggregator(job, mutation_results, univ_options):
     """
-    This module will aggregate all the mutations called in the previous steps
+    Aggregate all the called mutations.
 
-    :param job: job
     :param dict mutation_results: Dict of dicts of the various mutation callers in a per chromosome
-    format
-    :param dict univ_options: Universal Options
-    :param int numthreads: number of threads to use in the merge step
-    :returns: job store file id fo rthe merged mutations file
-
-    This module corresponds to node 15 on the tree
+           format
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :returns: fsID for the merged mutations file
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Aggregating mutations for %s' % univ_options['patient'])
-    # Setup an input data structure for the merge function
+    # Setup an input data structure for the merge function
     out = {}
     for chrom in mutation_results['mutect'].keys():
         out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
@@ -68,15 +73,14 @@ def run_mutation_aggregator(job, mutation_results, univ_options):
 
 def merge_perchrom_mutations(job, chrom, mutations, univ_options):
     """
-    This module will accept job store ids for vcf files for all snvs, and will merge the calls for a
-    single provided chromosome.
+    Merge the mutation calls for a single chromosome.
 
-    :param job: job
     :param str chrom: Chromosome to process
     :param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict of
-    per chromosome job store ids for vcfs as value
-    :param dict univ_options: Universal Options
-    :returns dict of merged vcf
+           per chromosome job store ids for vcfs as value
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :return: fsID for vcf containing merged calls for the given chromosome
+    :rtype: toil.fileStore.FileID
     """
     work_dir = os.getcwd()
     from protect.mutation_calling.muse import process_muse_vcf
@@ -125,10 +129,11 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options):
 
 def read_vcf(vcf_file):
     """
-    Reads a vcf file to a list of lists
+    Read a vcf file into a list of lists.
 
-    :param vcf_file:
-    :return:
+    :param str vcf_file: Path to a vcf file.
+    :return: list of lists of vcf records
+    :rtype: list
     """
     vcf_dict = []
     with open(vcf_file, 'r') as invcf:
@@ -142,17 +147,18 @@ def read_vcf(vcf_file):
 
 def chrom_sorted(in_chroms):
     """
-    This module will sort a list of chromosomes in the order 1..22, X, Y, M.
-    :param list in_chroms: input chromsomes
-    :return: sorted chromosomes
-    :rtype: list
+    Sort a list of chromosomes in the order 1..22, X, Y, M.
+
+    :param list in_chroms: Input chromosomes
+    :return: Sorted chromosomes
+    :rtype: list[str]
     """
     chr_prefix = False
     if in_chroms[0].startswith('chr'):
         in_chroms = [x.lstrip('chr') for x in in_chroms]
         chr_prefix = True
-    assert in_chroms[0] in [str(x) for x in range(1,23)] + ['X', 'Y', 'M']
-    in_chroms = sorted(in_chroms, key=lambda x: int(x) if x not in ('X', 'Y', 'M') else x)
+    assert in_chroms[0] in [str(x) for x in range(1, 23)] + ['X', 'Y', 'M']
+    in_chroms = sorted(in_chroms, key=lambda c: int(c) if c not in ('X', 'Y', 'M') else c)
     try:
         m_index = in_chroms.index('M')
     except ValueError:
@@ -168,13 +174,13 @@ def chrom_sorted(in_chroms):
 
 def merge_perchrom_vcfs(job, perchrom_vcfs, tool_name, univ_options):
     """
-    This module will merge per-chromosome vcf files into a single genome level vcf.
+    Merge per-chromosome vcf files into a single genome level vcf.
 
-    :param dict perchrom_vcfs: Dictionary with chromosome name as key and jobstore ID of
-    corresponding vcf as value
+    :param dict perchrom_vcfs: Dictionary with chromosome name as key and fsID of the corresponding
+           vcf as value
     :param str tool_name: Name of the tool that generated the vcfs
-
-    :returns: Job Store File ID for the merged vcf
+    :returns: fsID for the merged vcf
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running merge_perchrom_vcfs for %s' % tool_name)
     work_dir = os.getcwd()
@@ -199,13 +205,13 @@ def merge_perchrom_vcfs(job, perchrom_vcfs, tool_name, univ_options):
 
 def unmerge(job, input_vcf, tool_name, tool_options, univ_options):
     """
-    Un-merges a vcf file into a file per chromosome.
+    Un-merge a vcf file into per-chromosome vcfs.
 
     :param str input_vcf: Input vcf
    :param str tool_name: The name of the mutation caller
-    :param dict tool_options: Options specific to Somatic Sniper
-    :param dict univ_options: Universal options
-    :returns: dict of jsIDs, onr for each chromosomal vcf
+    :param dict tool_options: Options specific to the mutation caller
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :return: dict of fsIDs, one for each chromosomal vcf
     :rtype: dict
     """
     work_dir = os.getcwd()
@@ -243,4 +249,4 @@ def unmerge(job, input_vcf, tool_name, tool_options, univ_options):
         outdict[chrom] = job.fileStore.writeGlobalFile(chromvcf.name)
         export_results(job, outdict[chrom], chromvcf.name, univ_options,
                        subfolder='mutations/' + tool_name)
-    return outdict
\ No newline at end of file
+    return outdict
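For reference, the ordering `chrom_sorted` enforces can be reproduced with the small standalone sketch below. `sort_chromosomes` is an illustrative name, not a ProTECT function; unlike `chrom_sorted` (which leans on Python 2's int-before-str comparison and then repositions 'M' by hand) it uses an explicit rank map:

```python
# Standalone sketch of the 1..22, X, Y, M ordering enforced by chrom_sorted.
def sort_chromosomes(chroms):
    has_prefix = chroms[0].startswith('chr')
    # Strip any 'chr' prefix, sort on a numeric key, then restore the prefix.
    names = [c[3:] if c.startswith('chr') else c for c in chroms]
    rank = {'X': 23, 'Y': 24, 'M': 25}
    names.sort(key=lambda c: rank[c] if c in rank else int(c))
    return ['chr' + c for c in names] if has_prefix else names


assert sort_chromosomes(['chrM', 'chr2', 'chrX', 'chr10', 'chr1']) == \
    ['chr1', 'chr2', 'chr10', 'chrX', 'chrM']
```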
diff --git a/src/protect/mutation_calling/fusion.py b/src/protect/mutation_calling/fusion.py
index 8d723879..d5ee9beb 100644
--- a/src/protect/mutation_calling/fusion.py
+++ b/src/protect/mutation_calling/fusion.py
@@ -17,10 +17,13 @@
 
 def run_fusion_caller(job, star_bam, univ_options, fusion_options):
     """
-    This module will run a fusion caller on DNA bams. This module will be
-    implemented in the future.
+    Run a fusion caller on DNA bams. This module will be implemented in the future.
 
-    This module corresponds to node 10 on the tree
+    :param toil.fileStore.FileID star_bam: The input RNA-Seq bam
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict fusion_options: Options specific to fusion calling
+    :return: fsID to the merged fusion calls
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running FUSION on %s' % univ_options['patient'])
     job.fileStore.logToMaster('Fusions are currently unsupported.... Skipping.')
diff --git a/src/protect/mutation_calling/indel.py b/src/protect/mutation_calling/indel.py
index 35b07f0e..41f6624d 100644
--- a/src/protect/mutation_calling/indel.py
+++ b/src/protect/mutation_calling/indel.py
@@ -17,10 +17,14 @@
 
 def run_indel_caller(job, tumor_bam, normal_bam, univ_options, indel_options):
     """
-    This module will run an indel caller on the DNA bams. This module will be
-    implemented in the future.
+    Run an indel caller on the DNA bams. This module will be implemented in the future.
 
-    This module corresponds to node 13 on the tree
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict indel_options: Options specific to indel calling
+    :return: fsID to the merged indel calls
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running INDEL on %s' % univ_options['patient'])
     job.fileStore.logToMaster('INDELs are currently unsupported.... Skipping.')
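All of the `run_*_with_merge` wrappers touched below share one Toil idiom: the dynamically-spawning job is `encapsulate()`-d so a merge step can be attached after its yet-unknown children. A minimal sketch of the pattern, with `run_caller` and `merge_results` as illustrative stand-ins rather than real ProTECT functions:

```python
# encapsulate() wraps the spawning job and everything it dynamically adds into
# a single node, so a child of the encapsulated job runs only after the whole
# per-chromosome sub-graph has finished.
def run_caller(job, inputs):
    # In ProTECT this would add one child per chromosome and return promises.
    return {'chr1': 'fsid_1', 'chr2': 'fsid_2'}


def merge_results(job, perchrom):
    return sorted(perchrom.values())


def run_caller_with_merge(job, inputs):
    spawn = job.wrapJobFn(run_caller, inputs).encapsulate()
    merge = job.wrapJobFn(merge_results, spawn.rv())
    job.addChild(spawn)
    spawn.addChild(merge)
    return merge.rv()
```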
diff --git a/src/protect/mutation_calling/muse.py b/src/protect/mutation_calling/muse.py
index d6503520..a634782f 100644
--- a/src/protect/mutation_calling/muse.py
+++ b/src/protect/mutation_calling/muse.py
@@ -15,14 +15,17 @@
 from __future__ import print_function
 from collections import defaultdict
 from math import ceil
-from protect.common import (get_files_from_filestore, docker_path, docker_call, untargz,
-                            export_results)
-from protect.mutation_calling.common import sample_chromosomes, merge_perchrom_vcfs
+
+from protect.common import (docker_call,
+                            docker_path,
+                            export_results,
+                            get_files_from_filestore,
+                            untargz)
+from protect.mutation_calling.common import merge_perchrom_vcfs, sample_chromosomes
 from toil.job import PromisedRequirement
 
 import os
 import shutil
-import sys
 import time
 
@@ -39,9 +42,16 @@ def muse_sump_disk(dbsnp):
 
 def run_muse_with_merge(job, tumor_bam, normal_bam, univ_options, muse_options):
     """
-    This is a convenience function that runs the entire muse sub-graph.
+    A wrapper for the entire MuSE sub-graph.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict muse_options: Options specific to MuSE
+    :return: fsID to the merged MuSE calls
+    :rtype: toil.fileStore.FileID
     """
-    spawn = job.wrapJobFn(run_muse, tumor_bam, normal_bam, univ_options,muse_options,
+    spawn = job.wrapJobFn(run_muse, tumor_bam, normal_bam, univ_options, muse_options,
                           disk='100M').encapsulate()
     merge = job.wrapJobFn(merge_perchrom_vcfs, spawn.rv(), disk='100M')
     job.addChild(spawn)
@@ -51,51 +61,32 @@ def run_muse_with_merge(job, tumor_bam, normal_bam, univ_options, muse_options):
 
 def run_muse(job, tumor_bam, normal_bam, univ_options, muse_options):
     """
-    This module will spawn a muse job for each chromosome on the DNA bams.
-
-    ARGUMENTS
-    1. tumor_bam: Dict of input tumor WGS/WSQ bam + bai
-              tumor_bam
-              |- 'tumor_fix_pg_sorted.bam': 
-              +- 'tumor_fix_pg_sorted.bam.bai': 
-    2. normal_bam: Dict of input normal WGS/WSQ bam + bai
-              normal_bam
-              |- 'normal_fix_pg_sorted.bam': 
-              +- 'normal_fix_pg_sorted.bam.bai': 
-    3. univ_options: Dict of universal arguments used by almost all tools
-                     univ_options
-                     +- 'dockerhub': 
-    4. muse_options: Dict of parameters specific to muse
-                     muse_options
-                     |- 'dbsnp_vcf': 
-                     |- 'dbsnp_idx': 
-                     |- 'cosmic_vcf': 
-                     |- 'cosmic_idx': 
-                     |- 'genome_fasta': 
-                     +- 'genome_dict': 
-                     +- 'genome_fai': 
-
-    RETURN VALUES
-    1. perchrom_muse: Dict of results of muse per chromosome
-             perchrom_muse
-             |- 'chr1'
-             |  +- 
-             |- 'chr2'
-             |  +- 
-             etc...
-
-    This module corresponds to node 11 on the tree
+    Spawn a MuSE job for each chromosome on the DNA bams.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict muse_options: Options specific to MuSE
+    :return: Dict of results from running MuSE on every chromosome
+             perchrom_muse:
+                 |- 'chr1': fsID
+                 |- 'chr2': fsID
+                 |
+                 |- ...
+                 |
+                 +- 'chrM': fsID
+    :rtype: dict
     """
     # Get a list of chromosomes to handle
     chromosomes = sample_chromosomes(job, muse_options['genome_fai'])
     perchrom_muse = defaultdict()
     for chrom in chromosomes:
         call = job.addChildJobFn(run_muse_perchrom, tumor_bam, normal_bam, univ_options,
-                                 muse_options, chrom,
-                                 disk=PromisedRequirement(muse_disk,
-                                                          tumor_bam['tumor_dna_fix_pg_sorted.bam'],
-                                                          normal_bam['normal_dna_fix_pg_sorted.bam'],
-                                                          muse_options['genome_fasta']),
+                                 muse_options, chrom, disk=PromisedRequirement(
+                muse_disk,
+                tumor_bam['tumor_dna_fix_pg_sorted.bam'],
+                normal_bam['normal_dna_fix_pg_sorted.bam'],
+                muse_options['genome_fasta']),
                                  memory='6G')
         sump = call.addChildJobFn(run_muse_sump_perchrom, call.rv(), univ_options, muse_options,
                                   chrom,
@@ -108,21 +99,17 @@ def run_muse(job, tumor_bam, normal_bam, univ_options, muse_options):
 
 def run_muse_perchrom(job, tumor_bam, normal_bam, univ_options, muse_options, chrom):
     """
-    This module will run muse on the DNA bams
-
-    ARGUMENTS
-    1. tumor_bam: REFER ARGUMENTS of spawn_muse()
-    2. normal_bam: REFER ARGUMENTS of spawn_muse()
-    3. univ_options: REFER ARGUMENTS of spawn_muse()
-    4. muse_options: REFER ARGUMENTS of spawn_muse()
-    5. chrom: String containing chromosome name with chr appended
-
-    RETURN VALUES
-    1. output_files: 
-
-    This module corresponds to node 12 on the tree
+    Run MuSE call on a single chromosome in the input bams.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict muse_options: Options specific to MuSE
+    :param str chrom: Chromosome to process
+    :return: fsID for the chromosome vcf
+    :rtype: toil.fileStore.FileID
     """
-    job.fileStore.logToMaster('Running muse on %s:%s' % (univ_options['patient'], chrom))
+    job.fileStore.logToMaster('Running MuSE on %s:%s' % (univ_options['patient'], chrom))
     work_dir = os.getcwd()
     input_files = {
         'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
@@ -153,9 +140,16 @@ def run_muse_perchrom(job, tumor_bam, normal_bam, univ_options, muse_options, ch
 
 def run_muse_sump_perchrom(job, muse_output, univ_options, muse_options, chrom):
     """
-    This module will run muse sump on the muse output
+    Run MuSE sump on the MuSE call generated vcf.
+
+    :param toil.fileStore.FileID muse_output: vcf generated by MuSE call
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict muse_options: Options specific to MuSE
+    :param str chrom: Chromosome to process
+    :return: fsID for the chromosome vcf
+    :rtype: toil.fileStore.FileID
     """
-    job.fileStore.logToMaster('Running muse sump on %s:%s' % (univ_options['patient'], chrom))
+    job.fileStore.logToMaster('Running MuSE sump on %s:%s' % (univ_options['patient'], chrom))
     work_dir = os.getcwd()
     input_files = {
         'MuSE.txt': muse_output,
@@ -185,11 +179,13 @@ def run_muse_sump_perchrom(job, muse_output, univ_options, muse_options, chrom):
 
 def process_muse_vcf(job, muse_vcf, work_dir, univ_options):
     """
-    This does nothing for now except to download and return the muse vcfs
-    :param job: job
-    :param str muse_vcf: Job Store ID corresponding to a muse vcf for 1 chromosome
-    :param univ_options: Universal options
-    :returns dict: Dict with chromosomes as keys and path to the corresponding muse vcfs as values
+    Process the MuSE vcf for accepted calls.
+
+    :param toil.fileStore.FileID muse_vcf: fsID for a MuSE generated chromosome vcf
+    :param str work_dir: Working directory
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :return: Path to the processed vcf
+    :rtype: str
     """
     muse_vcf = job.fileStore.readGlobalFile(muse_vcf)
 
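MuSE, MuTect and RADIA all size their per-chromosome jobs with `PromisedRequirement`, since the bam sizes are only known once the upstream alignment jobs finish. A hedged sketch of the idiom; `caller_disk`, `run_caller_perchrom` and `spawn` are illustrative names, not ProTECT functions:

```python
from math import ceil

from toil.job import PromisedRequirement


def run_caller_perchrom(job, tumor_bam, normal_bam, options, chrom):
    return None  # stands in for the real per-chromosome caller


def caller_disk(tumor_bam, normal_bam, fasta):
    # Mirrors the *_disk helpers in these modules: a conservative sum of the
    # (promised) input file sizes.
    return int(ceil(tumor_bam.size) + ceil(normal_bam.size) + 4 * ceil(fasta.size))


def spawn(job, tumor_bam, normal_bam, options, chromosomes):
    # The disk requirement is computed lazily, from promises, at schedule time.
    return {chrom: job.addChildJobFn(
                run_caller_perchrom, tumor_bam, normal_bam, options, chrom,
                disk=PromisedRequirement(caller_disk,
                                         tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                         normal_bam['normal_dna_fix_pg_sorted.bam'],
                                         options['genome_fasta']),
                memory='6G').rv()
            for chrom in chromosomes}
```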
diff --git a/src/protect/mutation_calling/mutect.py b/src/protect/mutation_calling/mutect.py
index be1cc1a2..4647e2f2 100644
--- a/src/protect/mutation_calling/mutect.py
+++ b/src/protect/mutation_calling/mutect.py
@@ -15,18 +15,20 @@
 from __future__ import print_function
 from collections import defaultdict
 from math import ceil
-from protect.common import get_files_from_filestore, docker_path, docker_call, export_results, \
-    untargz, gunzip
+
+from protect.common import (docker_call,
+                            docker_path,
+                            export_results,
+                            get_files_from_filestore,
+                            gunzip,
+                            untargz)
 from protect.mutation_calling.common import sample_chromosomes, merge_perchrom_vcfs
+from toil.job import PromisedRequirement
 
 import os
-import sys
 
 
 # disk for mutect.
-from toil.job import PromisedRequirement
-
-
 def mutect_disk(tumor_bam, normal_bam, fasta, dbsnp, cosmic):
     return int(ceil(tumor_bam.size) +
                ceil(normal_bam.size) +
@@ -37,7 +39,14 @@ def mutect_disk(tumor_bam, normal_bam, fasta, dbsnp, cosmic):
 
 def run_mutect_with_merge(job, tumor_bam, normal_bam, univ_options, mutect_options):
     """
-    This is a convenience function that runs the entire mutect sub-graph.
+    A wrapper for the entire MuTect sub-graph.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict mutect_options: Options specific to MuTect
+    :return: fsID to the merged MuTect calls
+    :rtype: toil.fileStore.FileID
     """
     spawn = job.wrapJobFn(run_mutect, tumor_bam, normal_bam, univ_options,
                           mutect_options).encapsulate()
@@ -49,42 +58,21 @@ def run_mutect_with_merge(job, tumor_bam, normal_bam, univ_options, mutect_optio
 
 def run_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options):
     """
-    This module will spawn a mutect job for each chromosome on the DNA bams.
-
-    ARGUMENTS
-    1. tumor_bam: Dict of input tumor WGS/WSQ bam + bai
-              tumor_bam
-              |- 'tumor_fix_pg_sorted.bam': 
-              +- 'tumor_fix_pg_sorted.bam.bai': 
-    2. normal_bam: Dict of input normal WGS/WSQ bam + bai
-              normal_bam
-              |- 'normal_fix_pg_sorted.bam': 
-              +- 'normal_fix_pg_sorted.bam.bai': 
-    3. univ_options: Dict of universal arguments used by almost all tools
-                     univ_options
-                     +- 'dockerhub': 
-    4. mutect_options: Dict of parameters specific to mutect
-                     mutect_options
-                     |- 'dbsnp_vcf': 
-                     |- 'dbsnp_idx': 
-                     |- 'cosmic_vcf': 
-                     |- 'cosmic_idx': 
-                     |- 'genome_fasta': 
-                     +- 'genome_dict': 
-                     +- 'genome_fai': 
-
-    RETURN VALUES
-    1. perchrom_mutect: Dict of results of mutect per chromosome
-             perchrom_mutect
-             |- 'chr1'
-             |  +- 'mutect_chr1.vcf': 
-             |  +- 'mutect_chr1.out': 
-             |- 'chr2'
-             |  |- 'mutect_chr2.vcf': 
-             |  +- 'mutect_chr2.out': 
-             etc...
-
-    This module corresponds to node 11 on the tree
+    Spawn a MuTect job for each chromosome on the DNA bams.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict mutect_options: Options specific to MuTect
+    :return: Dict of results from running MuTect on every chromosome
+             perchrom_mutect:
+                 |- 'chr1': fsID
+                 |- 'chr2': fsID
+                 |
+                 |- ...
+                 |
+                 +- 'chrM': fsID
+    :rtype: dict
     """
     # Get a list of chromosomes to handle
     chromosomes = sample_chromosomes(job, mutect_options['genome_fai'])
@@ -103,24 +91,17 @@ def run_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options):
 
 def run_mutect_perchrom(job, tumor_bam, normal_bam, univ_options, mutect_options, chrom):
     """
-    This module will run mutect on the DNA bams
-
-    ARGUMENTS
-    1. tumor_bam: REFER ARGUMENTS of spawn_mutect()
-    2. normal_bam: REFER ARGUMENTS of spawn_mutect()
-    3. univ_options: REFER ARGUMENTS of spawn_mutect()
-    4. mutect_options: REFER ARGUMENTS of spawn_mutect()
-    5. chrom: String containing chromosome name with chr appended
-
-    RETURN VALUES
-    1. output_files: Dict of results of mutect for chromosome
-             output_files
-             |- 'mutect_CHROM.vcf': 
-             +- 'mutect_CHROM.out': 
-
-    This module corresponds to node 12 on the tree
+    Run MuTect call on a single chromosome in the input bams.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict mutect_options: Options specific to MuTect
+    :param str chrom: Chromosome to process
+    :return: fsID for the chromosome vcf
+    :rtype: toil.fileStore.FileID
     """
-    job.fileStore.logToMaster('Running mutect on %s:%s' % (univ_options['patient'], chrom))
+    job.fileStore.logToMaster('Running MuTect on %s:%s' % (univ_options['patient'], chrom))
     work_dir = os.getcwd()
     input_files = {
         'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
@@ -158,7 +139,7 @@ def run_mutect_perchrom(job, tumor_bam, normal_bam, univ_options, mutect_options
     java_xmx = mutect_options['java_Xmx'] if mutect_options['java_Xmx'] \
         else univ_options['java_Xmx']
     docker_call(tool='mutect', tool_parameters=parameters, work_dir=work_dir,
-                dockerhub=univ_options['dockerhub'], java_opts=java_xmx,
+                dockerhub=univ_options['dockerhub'], java_xmx=java_xmx,
                 tool_version=mutect_options['version'])
     output_file = job.fileStore.writeGlobalFile(mutvcf)
     export_results(job, output_file, mutvcf, univ_options, subfolder='mutations/mutect')
@@ -167,12 +148,13 @@ def run_mutect_perchrom(job, tumor_bam, normal_bam, univ_options, mutect_options
 
 def process_mutect_vcf(job, mutect_vcf, work_dir, univ_options):
     """
-    This will process all the mutect vcfs to return only passing calls
-    :param job: job
-    :param str mutect_vcf: Job Store ID corresponding to a mutect vcf for 1 chromosome
-    :param univ_options: Universal options
-    :returns dict: Dict with chromosomes as keys and path to the corresponding parsed mutect vcfs as
-    values
+    Process the MuTect vcf for accepted calls.
+
+    :param toil.fileStore.FileID mutect_vcf: fsID for a MuTect generated chromosome vcf
+    :param str work_dir: Working directory
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :return: Path to the processed vcf
+    :rtype: str
     """
     mutect_vcf = job.fileStore.readGlobalFile(mutect_vcf)
 
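The `process_*_vcf` helpers reduce each caller's raw output to its accepted calls. As a generic illustration of that kind of step (not necessarily MuTect's exact logic, which may consult caller-specific annotations), a minimal PASS filter over a vcf might look like:

```python
# Keep header lines and records whose FILTER column (column 7) is PASS.
# keep_passing_calls is an illustrative standalone sketch, not ProTECT code.
def keep_passing_calls(in_vcf, out_vcf):
    with open(in_vcf) as vcf_in, open(out_vcf, 'w') as vcf_out:
        for line in vcf_in:
            if line.startswith('#') or line.split('\t')[6] == 'PASS':
                vcf_out.write(line)
```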
diff --git a/src/protect/mutation_calling/radia.py b/src/protect/mutation_calling/radia.py
index 1456f873..6be03c40 100644
--- a/src/protect/mutation_calling/radia.py
+++ b/src/protect/mutation_calling/radia.py
@@ -15,18 +15,20 @@
 from __future__ import print_function
 from collections import defaultdict
 from math import ceil
-from protect.common import (get_files_from_filestore, docker_path, docker_call, export_results,
+
+from protect.common import (docker_path,
+                            docker_call,
+                            export_results,
+                            get_files_from_filestore,
                             untargz)
 from protect.mutation_calling.common import sample_chromosomes, merge_perchrom_vcfs
+from toil.job import PromisedRequirement
 
 import os
 import sys
 
 
 # disk for radia and filterradia.
-from toil.job import PromisedRequirement
-
-
 def radia_disk(tumor_bam, normal_bam, rna_bam, fasta):
     return int(ceil(tumor_bam.size) +
                ceil(normal_bam.size) +
@@ -36,10 +38,20 @@ def radia_disk(tumor_bam, normal_bam, rna_bam, fasta):
 
 def run_radia_with_merge(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options):
     """
-    This is a convenience function that runs the entire mutect sub-graph.
+    A wrapper for the entire RADIA sub-graph.
+
+    :param dict rna_bam: Dict of dicts of bam and bai for tumor RNA-Seq obtained by running STAR
+           within ProTECT.
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict radia_options: Options specific to RADIA
+    :return: fsID to the merged RADIA calls
+    :rtype: toil.fileStore.FileID
     """
-    spawn = job.wrapJobFn(run_radia, rna_bam, tumor_bam, normal_bam, univ_options,
-                          radia_options, disk='100M', memory='100M').encapsulate()
+    spawn = job.wrapJobFn(run_radia, rna_bam['rnaAligned.sortedByCoord.out.bam'], tumor_bam,
+                          normal_bam, univ_options, radia_options, disk='100M',
+                          memory='100M').encapsulate()
     merge = job.wrapJobFn(merge_perchrom_vcfs, spawn.rv(), univ_options, disk='100M',
                           memory='100M')
     job.addChild(spawn)
     spawn.addChild(merge)
@@ -48,45 +60,43 @@ def run_radia_with_merge(job, rna_bam, tumor_bam, normal_bam, univ_options, radi
 
 def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options):
     """
-    This module will spawn a radia job for each chromosome, on the RNA and DNA.
+    Spawn a RADIA job for each chromosome on the input bam trios.
 
-    ARGUMENTS
-    1. rna_bam: Dict of input STAR bams
-            rna_bam
-            |- 'rnaAligned.sortedByCoord.out.bam': REFER run_star()
-            |- 'rna_fix_pg_sorted.bam': 
-            +- 'rna_fix_pg_sorted.bam.bai': 
-    2. tumor_bam: Dict of input tumor WGS/WSQ bam + bai
-              tumor_bam
-              |- 'tumor_fix_pg_sorted.bam': 
-              +- 'tumor_fix_pg_sorted.bam.bai': 
-    3. normal_bam: Dict of input normal WGS/WSQ bam + bai
-              normal_bam
-              |- 'normal_fix_pg_sorted.bam': 
-              +- 'normal_fix_pg_sorted.bam.bai': 
-    4. univ_options: Dict of universal arguments used by almost all tools
-                     univ_options
-                     +- 'dockerhub': 
-    5. radia_options: Dict of parameters specific to radia
-                     radia_options
-                     |- 'genome_fasta': 
-                     +- 'genome_fai': 
-
-    RETURN VALUES
-    1. perchrom_radia: Dict of results of radia per chromosome
-             perchrom_radia
-             |- 'chr1'
-             |  +- 'radia_filtered_chr1.vcf': 
-             |- 'chr2'
-             |  +- 'radia_filtered_chr2.vcf': 
-             etc...
-
-    This module corresponds to node 11 on the tree
+    :param dict rna_bam: Dict of bam and bai for tumor RNA-Seq. It can be one of two formats
+           rna_bam:    # Just the genomic bam and bai
+               |- 'rna_fix_pg_sorted.bam': fsID
+               +- 'rna_fix_pg_sorted.bam.bai': fsID
+           OR
+           rna_bam:    # The output from run_star
+               |- 'rnaAligned.toTranscriptome.out.bam': fsID
+               |- 'rnaAligned.sortedByCoord.out.bam':    # Only this part will be used
+                      |- 'rna_fix_pg_sorted.bam': fsID
+                      +- 'rna_fix_pg_sorted.bam.bai': fsID
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict radia_options: Options specific to RADIA
+    :return: Dict of results from running RADIA on every chromosome
+             perchrom_radia:
+                 |- 'chr1': fsID
+                 |- 'chr2': fsID
+                 |
+                 |- ...
+                 |
+                 +- 'chrM': fsID
+    :rtype: dict
     """
     job.fileStore.logToMaster('Running spawn_radia on %s' % univ_options['patient'])
-    rna_bam_key = 'rnaAligned.sortedByCoord.out.bam'  # to reduce next line size
-    bams = {'tumor_rna': rna_bam[rna_bam_key]['rna_fix_pg_sorted.bam'],
-            'tumor_rnai': rna_bam[rna_bam_key]['rna_fix_pg_sorted.bam.bai'],
+    if set(rna_bam.keys()) == {'rnaAligned.toTranscriptome.out.bam',
+                               'rnaAligned.sortedByCoord.out.bam'}:
+        rna_bam = rna_bam['rnaAligned.sortedByCoord.out.bam']
+    elif set(rna_bam.keys()) == {'rna_fix_pg_sorted.bam', 'rna_fix_pg_sorted.bam.bai'}:
+        pass
+    else:
+        raise RuntimeError('An improperly formatted dict was passed to rna_bam.')
+
+    bams = {'tumor_rna': rna_bam['rna_fix_pg_sorted.bam'],
+            'tumor_rnai': rna_bam['rna_fix_pg_sorted.bam.bai'],
             'tumor_dna': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
             'tumor_dnai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
            'normal_dna': normal_bam['normal_dna_fix_pg_sorted.bam'],
@@ -100,14 +110,14 @@ def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options):
                                  disk=PromisedRequirement(
                                      radia_disk, tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                      normal_bam['normal_dna_fix_pg_sorted.bam'],
-                                     rna_bam[rna_bam_key]['rna_fix_pg_sorted.bam'],
+                                     rna_bam['rna_fix_pg_sorted.bam'],
                                      radia_options['genome_fasta']))
         filter_radia = radia.addChildJobFn(run_filter_radia, bams, radia.rv(), univ_options,
                                            radia_options, chrom, memory='6G',
                                            disk=PromisedRequirement(
                                                radia_disk, tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                                normal_bam['normal_dna_fix_pg_sorted.bam'],
-                                               rna_bam[rna_bam_key]['rna_fix_pg_sorted.bam'],
+                                               rna_bam['rna_fix_pg_sorted.bam'],
                                                radia_options['genome_fasta']))
         perchrom_radia[chrom] = filter_radia.rv()
     return perchrom_radia
@@ -115,30 +125,14 @@ def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options):
 
 def run_radia_perchrom(job, bams, univ_options, radia_options, chrom):
     """
-    This module will run radia on the RNA and DNA bams
-
-    ARGUMENTS
-    1. bams: Dict of bams and their indexes
-            bams
-            |- 'tumor_rna': 
-            |- 'tumor_rnai': 
-            |- 'tumor_dna': 
-            |- 'tumor_dnai': 
-            |- 'normal_dna': 
-            +- 'normal_dnai': 
-    2. univ_options: Dict of universal arguments used by almost all tools
-                     univ_options
-                     +- 'dockerhub': 
-    3. radia_options: Dict of parameters specific to radia
-                     radia_options
-                     |- 'dbsnp_vcf': 
-                     +- 'genome': 
-    4. chrom: String containing chromosome name with chr appended
+    Run RADIA call on a single chromosome in the input bams.
 
-    RETURN VALUES
-    1. Dict of filtered radia output vcf and logfile (Nested return)
-        |- 'radia_filtered_CHROM.vcf': 
-        +- 'radia_filtered_CHROM_radia.log': 
+    :param dict bams: Dict of bam and bai for tumor DNA-Seq, normal DNA-Seq and tumor RNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict radia_options: Options specific to RADIA
+    :param str chrom: Chromosome to process
+    :return: fsID for the chromosome vcf
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running radia on %s:%s' % (univ_options['patient'], chrom))
     work_dir = os.getcwd()
@@ -183,17 +177,15 @@ def run_radia_perchrom(job, bams, univ_options, radia_options, chrom):
 
 def run_filter_radia(job, bams, radia_file, univ_options, radia_options, chrom):
     """
-    This module will run filterradia on the RNA and DNA bams.
+    Run filterradia on the RADIA output.
 
-    ARGUMENTS
-    1. bams: REFER ARGUMENTS of run_radia()
-    2. univ_options: REFER ARGUMENTS of run_radia()
-    3. radia_file: 
-    3. radia_options: REFER ARGUMENTS of run_radia()
-    4. chrom: REFER ARGUMENTS of run_radia()
-
-    RETURN VALUES
-    1. output_file: 
+    :param dict bams: Dict of bam and bai for tumor DNA-Seq, normal DNA-Seq and tumor RNA-Seq
+    :param toil.fileStore.FileID radia_file: The vcf from running RADIA
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict radia_options: Options specific to RADIA
+    :param str chrom: Chromosome to process
+    :return: fsID for the filtered chromosome vcf
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running filter-radia on %s:%s' % (univ_options['patient'], chrom))
     work_dir = os.getcwd()
@@ -252,14 +244,14 @@ def run_filter_radia(job, bams, radia_file, univ_options, radia_options, chrom):
 
 def process_radia_vcf(job, radia_vcf, work_dir, univ_options):
     """
-    This function will parse the vcf to detect sites having multiple alt alleles and pick out on the
-    most likely ones.
+    Process the RADIA vcf for passing calls and, for sites having multiple alt alleles, pick out
+    the most likely ones.
 
-    :param job: job
-    :param str radia_vcf: Job Store ID corresponding to a radia vcf for 1 chromosome
-    :param univ_options: Universal options
-    :returns dict: Dict with chromosomes as keys and path to the corresponding parsed radia vcfs as
-    values
+    :param toil.fileStore.FileID radia_vcf: fsID for a RADIA generated chromosome vcf
+    :param str work_dir: Working directory
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :return: Path to the processed vcf
+    :rtype: str
     """
     radia_vcf = job.fileStore.readGlobalFile(radia_vcf)
     with open(radia_vcf, 'r') as infile, open(radia_vcf + 'radia_parsed.tmp', 'w') as outfile:
@@ -373,4 +365,3 @@ def process_radia_vcf(job, radia_vcf, work_dir, univ_options):
         else:
             pass
     return outfile.name
-
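The two accepted shapes of `rna_bam` that `run_radia` normalises can be exercised in isolation with the standalone restatement below; `normalise_rna_bam` is an illustrative name, not a ProTECT function:

```python
# Reduce either accepted rna_bam shape to the genomic bam/bai dict, exactly
# as run_radia does at its top.
def normalise_rna_bam(rna_bam):
    if set(rna_bam.keys()) == {'rnaAligned.toTranscriptome.out.bam',
                               'rnaAligned.sortedByCoord.out.bam'}:
        return rna_bam['rnaAligned.sortedByCoord.out.bam']
    elif set(rna_bam.keys()) == {'rna_fix_pg_sorted.bam',
                                 'rna_fix_pg_sorted.bam.bai'}:
        return rna_bam
    raise RuntimeError('An improperly formatted dict was passed to rna_bam.')
```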
diff --git a/src/protect/mutation_calling/somaticsniper.py b/src/protect/mutation_calling/somaticsniper.py
index 0d551c0e..f2b117eb 100644
--- a/src/protect/mutation_calling/somaticsniper.py
+++ b/src/protect/mutation_calling/somaticsniper.py
@@ -15,12 +15,13 @@
 from __future__ import print_function
 from collections import defaultdict
 from math import ceil
-from protect.common import (get_files_from_filestore,
-                            docker_path,
+
+from protect.common import (docker_path,
                             docker_call,
+                            get_files_from_filestore,
                             untargz)
-from protect.mutation_calling.common import (unmerge,
-                                             sample_chromosomes)
+from protect.mutation_calling.common import (sample_chromosomes,
+                                             unmerge)
 from toil.job import PromisedRequirement
 
 import os
@@ -43,10 +44,18 @@ def sniper_filter_disk(tumor_bam, fasta):
             2 * ceil(fasta.size))
 
 
-# Somatic Sniper is a different tool from the other callers because it is run in full
+# Somatic Sniper is a different tool from the other callers because it is run in full. Certain
+# redundant modules in this file are kept to maintain a similar naming schema to other callers.
 def run_somaticsniper_with_merge(job, tumor_bam, normal_bam, univ_options, somaticsniper_options):
     """
-    This is a convenience function that runs the entire somaticsniper sub-graph.
+    A wrapper for the entire SomaticSniper sub-graph.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict somaticsniper_options: Options specific to SomaticSniper
+    :return: fsID to the merged SomaticSniper calls
+    :rtype: toil.fileStore.FileID
     """
     spawn = job.wrapJobFn(run_somaticsniper, tumor_bam, normal_bam, univ_options,
                           somaticsniper_options, split=False).encapsulate()
@@ -56,40 +65,24 @@ def run_somaticsniper_with_merge(job, tumor_bam, normal_bam, univ_options, somat
 
 def run_somaticsniper(job, tumor_bam, normal_bam, univ_options, somaticsniper_options, split=True):
     """
-    This module will spawn a somaticsniper job for each chromosome on the DNA bams.
-
-    ARGUMENTS
-    1. tumor_bam: Dict of input tumor WGS/WSQ bam + bai
-              tumor_bam
-              |- 'tumor_fix_pg_sorted.bam': 
-              +- 'tumor_fix_pg_sorted.bam.bai': 
-    2. normal_bam: Dict of input normal WGS/WSQ bam + bai
-              normal_bam
-              |- 'normal_fix_pg_sorted.bam': 
-              +- 'normal_fix_pg_sorted.bam.bai': 
-    3. univ_options: Dict of universal arguments used by almost all tools
-                     univ_options
-                     +- 'dockerhub': 
-    4. somaticsniper_options: Dict of parameters specific to somaticsniper
-                     somaticsniper_options
-                     |- 'dbsnp_vcf': 
-                     |- 'dbsnp_idx': 
-                     |- 'cosmic_vcf': 
-                     |- 'cosmic_idx': 
-                     |- 'genome_fasta': 
-                     +- 'genome_dict': 
-                     +- 'genome_fai': 
-
-    RETURN VALUES
-    1. perchrom_somaticsniper: Dict of results of somaticsniper per chromosome
-             perchrom_somaticsniper
-             |- 'chr1'
-             |  +- 
-             |- 'chr2'
-             |  +- 
-             etc...
-
-    This module corresponds to node 11 on the tree
+    Run the SomaticSniper subgraph on the DNA bams. Optionally split the results into
+    per-chromosome vcfs.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict somaticsniper_options: Options specific to SomaticSniper
+    :param bool split: Should the results be split into perchrom vcfs?
+    :return: Either the fsID to the genome-level vcf or a dict of results from running SomaticSniper
+             on every chromosome
+             perchrom_somaticsniper:
+                 |- 'chr1': fsID
+                 |- 'chr2': fsID
+                 |
+                 |- ...
+                 |
+                 +- 'chrM': fsID
+    :rtype: toil.fileStore.FileID|dict
     """
     # Get a list of chromosomes to handle
     chromosomes = sample_chromosomes(job, somaticsniper_options['genome_fai'])
@@ -128,19 +121,16 @@ def run_somaticsniper(job, tumor_bam, normal_bam, univ_options, somaticsniper_op
 
 def run_somaticsniper_full(job, tumor_bam, normal_bam, univ_options, somaticsniper_options):
     """
-    This module will run somaticsniper on the DNA bams.
-
-    ARGUMENTS
-    :param dict tumor_bam: REFER ARGUMENTS of spawn_somaticsniper()
-    :param dict normal_bam: REFER ARGUMENTS of spawn_somaticsniper()
-    :param dict univ_options: REFER ARGUMENTS of spawn_somaticsniper()
-    :param dict somaticsniper_options: REFER ARGUMENTS of spawn_somaticsniper()
-
-    RETURN VALUES
-    :returns: dict of output vcfs for each chromosome
-    :rtype: dict
+    Run SomaticSniper on the DNA bams.
+ + :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq + :param dict normal_bam: Dict of bam and bai for normal DNA-Seq + :param dict univ_options: Dict of universal options used by almost all tools + :param dict somaticsniper_options: Options specific to SomaticSniper + :return: fsID to the genome-level vcf + :rtype: toil.fileStore.FileID """ - job.fileStore.logToMaster('Running somaticsniper on %s' % univ_options['patient']) + job.fileStore.logToMaster('Running SomaticSniper on %s' % univ_options['patient']) work_dir = os.getcwd() input_files = { 'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'], @@ -174,18 +164,17 @@ def run_somaticsniper_full(job, tumor_bam, normal_bam, univ_options, somaticsnip def filter_somaticsniper(job, tumor_bam, somaticsniper_output, tumor_pileup, univ_options, somaticsniper_options): """ - This module will filter the somaticsniper output for a single chromosome - - :param toil.Job job: Job - :param dict tumor_bam: Tumor bam file and it's bai - :param str somaticsniper_output: jsID from somatic sniper - :param str tumor_pileup: jsID for pileup file for this chromsome - :param dict univ_options: Universal options - :param dict somaticsniper_options: Options specific to Somatic Sniper - :returns: filtered chromsome vcf - :rtype: str + Filter SomaticSniper calls. + + :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq + :param toil.fileStore.FileID somaticsniper_output: SomaticSniper output vcf + :param toil.fileStore.FileID tumor_pileup: Pileup generated for the tumor bam + :param dict univ_options: Dict of universal options used by almost all tools + :param dict somaticsniper_options: Options specific to SomaticSniper + :returns: fsID for the filtered genome-level vcf + :rtype: toil.fileStore.FileID """ - job.fileStore.logToMaster('Filtering somaticsniper for %s' % univ_options['patient']) + job.fileStore.logToMaster('Filtering SomaticSniper for %s' % univ_options['patient']) work_dir = os.getcwd() input_files = { 'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'], @@ -251,12 +240,15 @@ def filter_somaticsniper(job, tumor_bam, somaticsniper_output, tumor_pileup, uni def process_somaticsniper_vcf(job, somaticsniper_vcf, work_dir, univ_options): """ - This does nothing for now except to download and return the somaticsniper vcfs - :param job: job - :param str somaticsniper_vcf: Job Store ID corresponding to a somaticsniper vcf for 1 chromosome - :param univ_options: Universal options - :returns dict: Dict with chromosomes as keys and path to the corresponding somaticsniper vcfs as - values + Process the SomaticSniper vcf for accepted calls. Since the calls are pre-filtered, we just + obtain the file from the file store and return it. + + :param toil.fileStore.FileID somaticsniper_vcf: fsID for a SomaticSniper generated chromosome + vcf + :param str work_dir: Working directory + :param dict univ_options: Dict of universal options used by almost all tools + :return: Path to the processed vcf + :rtype: str """ return job.fileStore.readGlobalFile(somaticsniper_vcf) @@ -265,11 +257,11 @@ def run_pileup(job, tumor_bam, univ_options, somaticsniper_options): """ Runs a samtools pileup on the tumor bam. 
-    :param toil.Job job: job
-    :param dict tumor_bam: Tumor bam file
-    :param dict univ_options: Universal Options
-    :returns: jsID for the chromsome pileup file
-    :rtype: str
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict somaticsniper_options: Options specific to SomaticSniper
+    :return: fsID for the pileup file
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster(
         'Running samtools pileup on %s' % univ_options['patient'])
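SomaticSniper and strelka run genome-wide rather than per chromosome; the `split` flag then decides whether the genome-level vcf is exploded into per-chromosome vcfs so the downstream merge machinery can treat all callers uniformly. A hedged sketch of that control flow, with all names illustrative rather than ProTECT functions:

```python
def call_genome_wide(job, bams, options):
    return 'genome_vcf_fsid'  # stand-in for the real genome-wide caller


def split_by_chromosome(job, vcf_id, options):
    return {'chr1': vcf_id, 'chr2': vcf_id}  # stand-in for unmerge


def run_full_caller(job, bams, options, split=True):
    # The caller runs once over the whole genome...
    full = job.wrapJobFn(call_genome_wide, bams, options)
    job.addChild(full)
    if not split:
        return full.rv()
    # ...and the genome-level vcf is optionally unmerged afterwards.
    unmerge_job = full.addChildJobFn(split_by_chromosome, full.rv(), options)
    return unmerge_job.rv()
```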
diff --git a/src/protect/mutation_calling/strelka.py b/src/protect/mutation_calling/strelka.py
index 67ee75db..eca70109 100644
--- a/src/protect/mutation_calling/strelka.py
+++ b/src/protect/mutation_calling/strelka.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 from __future__ import print_function
 from math import ceil
+
 from protect.common import (docker_call,
                             docker_path,
                             get_files_from_filestore,
@@ -32,10 +33,18 @@ def strelka_disk(tumor_bam, normal_bam, fasta):
             4 * ceil(fasta.size))
 
 
-# Strelka is a different tool from the other callers because it is run in full
+# Strelka is a different tool from the other callers because it is run in full. Certain
+# redundant modules in this file are kept to maintain a similar naming schema to other callers.
 def run_strelka_with_merge(job, tumor_bam, normal_bam, univ_options, strelka_options):
     """
-    This is a convenience function that runs the entire strelka sub-graph.
+    A wrapper for the entire strelka sub-graph.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict strelka_options: Options specific to strelka
+    :return: fsID to the merged strelka calls
+    :rtype: toil.fileStore.FileID
     """
     spawn = job.wrapJobFn(run_strelka, tumor_bam, normal_bam, univ_options,
                           strelka_options, split=False).encapsulate()
@@ -45,42 +54,29 @@ def run_strelka_with_merge(job, tumor_bam, normal_bam, univ_options, strelka_opt
 
 def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split=True):
     """
-    This module will spawn a strelka job for each chromosome on the DNA bams.
-
-    ARGUMENTS
-    1. tumor_bam: Dict of input tumor WGS/WSQ bam + bai
-              tumor_bam
-              |- 'tumor_fix_pg_sorted.bam': 
-              +- 'tumor_fix_pg_sorted.bam.bai': 
-    2. normal_bam: Dict of input normal WGS/WSQ bam + bai
-              normal_bam
-              |- 'normal_fix_pg_sorted.bam': 
-              +- 'normal_fix_pg_sorted.bam.bai': 
-    3. univ_options: Dict of universal arguments used by almost all tools
-                     univ_options
-                     +- 'dockerhub': 
-    4. strelka_options: Dict of parameters specific to strelka
-                     strelka_options
-                     |- 'dbsnp_vcf': 
-                     |- 'dbsnp_idx': 
-                     |- 'cosmic_vcf': 
-                     |- 'cosmic_idx': 
-                     |- 'genome_fasta': 
-                     +- 'genome_dict': 
-                     +- 'genome_fai': 
-
-    RETURN VALUES
-    1. perchrom_strelka: Dict of results of strelka per chromosome
-             perchrom_strelka
-             |- 'chr1'
-             |  +- 'strelka_chr1.vcf': 
-             |  +- 'strelka_chr1.out': 
-             |- 'chr2'
-             |  |- 'strelka_chr2.vcf': 
-             |  +- 'strelka_chr2.out': 
-             etc...
-
-    This module corresponds to node 11 on the tree
+    Run the strelka subgraph on the DNA bams. Optionally split the results into per-chromosome
+    vcfs.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict strelka_options: Options specific to strelka
+    :param bool split: Should the results be split into perchrom vcfs?
+    :return: Either the fsID to the genome-level vcf or a dict of results from running strelka
+             on every chromosome
+             perchrom_strelka:
+                 |- 'chr1':
+                 |      |-'snvs': fsID
+                 |      +-'indels': fsID
+                 |- 'chr2':
+                 |      |-'snvs': fsID
+                 |      +-'indels': fsID
+                 |- ...
+                 |
+                 +- 'chrM':
+                        |-'snvs': fsID
+                        +-'indels': fsID
+    :rtype: toil.fileStore.FileID|dict
     """
     chromosomes = sample_chromosomes(job, strelka_options['genome_fai'])
     num_cores = min(len(chromosomes), univ_options['max_cores'])
@@ -104,16 +100,16 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split
 
 def run_strelka_full(job, tumor_bam, normal_bam, univ_options, strelka_options):
     """
-    This module will run strelka on the DNA bams.
-
-    ARGUMENTS
-    :param dict tumor_bam: REFER ARGUMENTS of spawn_strelka()
-    :param dict normal_bam: REFER ARGUMENTS of spawn_strelka()
-    :param dict univ_options: REFER ARGUMENTS of spawn_strelka()
-    :param dict strelka_options: REFER ARGUMENTS of spawn_strelka()
-
-    RETURN VALUES
-    :returns: dict of output vcfs for each chromosome
+    Run strelka on the DNA bams.
+
+    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
+    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict strelka_options: Options specific to strelka
+    :return: Dict of fsIDs for the snv and indel prediction files
+             output_dict:
+                 |-'snvs': fsID
+                 +-'indels': fsID
     :rtype: dict
     """
     job.fileStore.logToMaster('Running strelka on %s' % univ_options['patient'])
@@ -150,24 +146,38 @@ def run_strelka_full(job, tumor_bam, normal_bam, univ_options, strelka_options):
 
 def process_strelka_vcf(job, strelka_vcf, work_dir, univ_options):
     """
-    This will process all the strelka vcfs to return only passing calls
-    :param job: job
-    :param str strelka_vcf: Job Store ID corresponding to a strelka vcf for 1 chromosome
-    :param univ_options: Universal options
-    :returns dict: Dict with chromosomes as keys and path to the corresponding parsed strelka vcfs as
-    values
+    Process the strelka vcf for accepted calls. Since the calls are pre-filtered, we just obtain the
+    file from the file store and return it.
+
+    :param toil.fileStore.FileID strelka_vcf: fsID for a strelka generated chromosome vcf
+    :param str work_dir: Working directory
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :return: Path to the processed vcf
+    :rtype: str
     """
     return job.fileStore.readGlobalFile(strelka_vcf)
 
 
 def wrap_unmerge(job, strelka_out, strelka_options, univ_options):
     """
-    This is a convenience function that wraps the unmerge for strelkas snvs and indels
-
-    :param toil.Job job: job
-    :param dict strelka_out:
-    :param dict strelka_options:
-    :param dict univ_options:
+    A wrapper to unmerge the strelka snvs and indels.
+
+    :param dict strelka_out: Results from run_strelka
+    :param dict strelka_options: Options specific to strelka
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :return: Dict of dicts containing the fsIDs for the per-chromosome snv and indel calls
+             output:
+                 |- 'snvs':
+                 |      |- 'chr1': fsID
+                 |      |- 'chr2': fsID
+                 |      |- ...
+                 |      +- 'chrM': fsID
+                 +- 'indels':
+                        |- 'chr1': fsID
+                        |- 'chr2': fsID
+                        |- ...
+                        +- 'chrM': fsID
+    :rtype: dict
     """
     return {'snvs': job.addChildJobFn(unmerge, strelka_out['snvs'], 'strelka/snv',
                                       strelka_options, univ_options).rv(),
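For reference, the core of what `unmerge` does to a genome-level vcf can be restated in a few self-contained lines: replay the header into every per-chromosome file and route records on their first column. `split_vcf_by_chrom` is an illustrative sketch, not the ProTECT implementation (which also writes the outputs to the file store):

```python
from collections import defaultdict


def split_vcf_by_chrom(vcf_path):
    header, records = [], defaultdict(list)
    with open(vcf_path) as vcf:
        for line in vcf:
            if line.startswith('#'):
                header.append(line)
            else:
                records[line.split('\t', 1)[0]].append(line)
    out = {}
    for chrom, lines in records.items():
        out[chrom] = chrom + '.vcf'
        with open(out[chrom], 'w') as ofh:
            ofh.writelines(header + lines)
    return out
```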
+ | +- 'chrM': fsID + +- 'indels': + |- 'chr1': fsID + |- 'chr2': fsID + |- ... + +- 'chrM': fsID + :rtype: dict """ return {'snvs': job.addChildJobFn(unmerge, strelka_out['snvs'], 'strelka/snv', strelka_options, univ_options).rv(), diff --git a/src/protect/mutation_translation.py b/src/protect/mutation_translation.py index 534feda0..82c44306 100644 --- a/src/protect/mutation_translation.py +++ b/src/protect/mutation_translation.py @@ -13,35 +13,40 @@ # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import, print_function + from collections import defaultdict -from protect.common import docker_call, get_files_from_filestore, export_results, untargz, \ - docker_path + +from protect.common import (docker_call, + get_files_from_filestore, + export_results, + untargz, + docker_path) import os def run_transgene(job, snpeffed_file, rna_bam, univ_options, transgene_options): """ - This module will run transgene on the input vcf file from the aggregator and produce the - peptides for MHC prediction - - ARGUMENTS - 1. snpeffed_file: - 2. univ_options: Dict of universal arguments used by almost all tools - univ_options - +- 'dockerhub': - 3. transgene_options: Dict of parameters specific to transgene - transgene_options - +- 'gencode_peptide_fasta': + Run transgene on an input snpeffed vcf filereturn the peptides for MHC prediction. - RETURN VALUES - 1. output_files: Dict of transgened n-mer peptide fastas - output_files - |- 'transgened_tumor_9_mer_snpeffed.faa': - |- 'transgened_tumor_10_mer_snpeffed.faa': - +- 'transgened_tumor_15_mer_snpeffed.faa': - This module corresponds to node 17 on the tree + :param toil.fileStore.FileID snpeffed_file: fsID for snpeffed vcf + :param dict rna_bam: The dict of bams returned by running star + :param dict univ_options: Dict of universal options used by almost all tools + :param dict transgene_options: Options specific to Transgene + :return: A dictionary of 9 files (9-, 10-, and 15-mer peptides each for Tumor and Normal and the + corresponding .map files for the 3 Tumor fastas) + output_files: + |- 'transgened_normal_10_mer_snpeffed.faa': fsID + |- 'transgened_normal_15_mer_snpeffed.faa': fsID + |- 'transgened_normal_9_mer_snpeffed.faa': fsID + |- 'transgened_tumor_10_mer_snpeffed.faa': fsID + |- 'transgened_tumor_10_mer_snpeffed.faa.map': fsID + |- 'transgened_tumor_15_mer_snpeffed.faa': fsID + |- 'transgened_tumor_15_mer_snpeffed.faa.map': fsID + |- 'transgened_tumor_9_mer_snpeffed.faa': fsID + +- 'transgened_tumor_9_mer_snpeffed.faa.map': fsID + :rtype: dict """ job.fileStore.logToMaster('Running transgene on %s' % univ_options['patient']) work_dir = os.getcwd() diff --git a/src/protect/pipeline/ProTECT.py b/src/protect/pipeline/ProTECT.py index 5a45ee18..9d102579 100644 --- a/src/protect/pipeline/ProTECT.py +++ b/src/protect/pipeline/ProTECT.py @@ -21,16 +21,13 @@ Details can also be obtained by running the script with -h . 
""" from __future__ import print_function - - from multiprocessing import cpu_count -from urlparse import urlparse from protect.addons import run_mhc_gene_assessment from protect.alignment.dna import align_dna from protect.alignment.rna import align_rna -from protect.binding_prediction.common import spawn_antigen_predictors, merge_mhc_peptide_calls -from protect.common import delete_fastqs, ParameterError +from protect.binding_prediction.common import merge_mhc_peptide_calls, spawn_antigen_predictors +from protect.common import delete_fastqs, get_file_from_s3, get_file_from_url, ParameterError from protect.expression_profiling.rsem import wrap_rsem from protect.haplotyping.phlat import merge_phlat_calls, phlat_disk, run_phlat from protect.mutation_annotation.snpeff import run_snpeff, snpeff_disk @@ -46,14 +43,11 @@ from protect.qc.rna import cutadapt_disk, run_cutadapt from protect.rankboost import wrap_rankboost from toil.job import Job, PromisedRequirement -from protect.common import get_file_from_s3 -from protect.common import get_file_from_url import argparse import os import pkg_resources import shutil -import subprocess import sys import yaml @@ -62,6 +56,7 @@ def _ensure_set_contains(test_object, required_object, test_set_name=None): """ Ensure that the required entries (set or keys of a dict) are present in the test set or keys of the test dict. + :param set|dict test_object: The test set or dict :param set|dict required_object: The entries that need to be present in the test set (keys of input dict if input is dict) @@ -121,8 +116,9 @@ def _process_group(input_group, required_group, groupname, append_subgroups=None :param dict input_group: The dict of values of the input group :param dict required_group: The dict of required values for the input group :param str groupname: The name of the group being processed - :param list append_group: list of subgroups to append to each, other subgroup in this group - :return: dict + :param list append_subgroups: list of subgroups to append to each, other subgroup in this group + :return: processed dict of entries for the group + :rtype: dict """ if append_subgroups is None: append_subgroups = [] @@ -144,17 +140,12 @@ def _process_group(input_group, required_group, groupname, append_subgroups=None def _parse_config_file(job, config_file, max_cores=None): """ - This module will parse the config file withing params and set up the variables that will be - passed to the various tools in the pipeline. - - ARGUMENTS - config_file: string containing path to a config file. An example config - file is available at - https://s3-us-west-2.amazonaws.com/pimmuno-references - /input_parameters.list + Parse the input yaml config file and download all tool inputs into the file store. - RETURN VALUES - None + :param str config_file: Path to the input config file + :param int max_cores: The maximum cores to use for any single high-compute job. 
+    :return: tuple of dict of sample dicts, dict of universal options, dict of dicts of tool
+             options
+    :rtype: tuple(dict, dict, dict)
     """
     job.fileStore.logToMaster('Parsing config file')
     config_file = os.path.abspath(config_file)
@@ -166,7 +157,6 @@ def _parse_config_file(job, config_file, max_cores=None):
     univ_options = {}
     tool_options = {}
 
-    # Read in the input yaml
     with open(config_file, 'r') as conf:
         input_config = yaml.load(conf.read())
 
@@ -264,17 +254,10 @@ def _parse_config_file(job, config_file, max_cores=None):
 
 def parse_config_file(job, config_file, max_cores=None):
     """
-    This module will parse the config file withing params and set up the variables that will be
-    passed to the various tools in the pipeline.
-
-    ARGUMENTS
-    config_file: string containing path to a config file.  An example config
-                 file is available at
-                        https://s3-us-west-2.amazonaws.com/pimmuno-references
-                        /input_parameters.list
+    Parse the config file and spawn a ProTECT job for every input sample.
 
-    RETURN VALUES
-    None
+    :param str config_file: Path to the input config file
+    :param int max_cores: The maximum cores to use for any single high-compute job.
     """
     sample_set, univ_options, processed_tool_inputs = _parse_config_file(job, config_file,
                                                                          max_cores)
@@ -282,12 +265,19 @@ def parse_config_file(job, config_file, max_cores=None):
     for patient_id in sample_set.keys():
         # Add the patient id to the sample set. Typecast to str so cat operations work later on.
         sample_set[patient_id]['patient_id'] = str(patient_id)
-        job.addFollowOnJobFn(pipeline_launchpad, sample_set[patient_id], univ_options,
+        job.addFollowOnJobFn(launch_protect, sample_set[patient_id], univ_options,
                              processed_tool_inputs)
     return None
 
 
 def ascertain_cpu_share(max_cores=None):
+    """
+    Ascertain the number of cpus allowed for each high-compute job instance (bwa, star, rsem,
+    phlat).
+
+    :param max_cores: The user-specified max
+    :return: The number of cpus allowed for each high-compute job instance
+    :rtype: int
+    """
     # Ascertain the number of available CPUs. Jobs will be given fractions of this value.
     num_cores = cpu_count()
     # The minimum number of cpus should be at least 6 if possible
@@ -298,15 +288,13 @@ def ascertain_cpu_share(max_cores=None):
     return cpu_share
 
 
-def pipeline_launchpad(job, fastqs, univ_options, tool_options):
+def launch_protect(job, fastqs, univ_options, tool_options):
     """
-    The precision immuno pipeline begins at this module. The DAG can be viewed in Flowchart.txt
+    The launchpad for ProTECT. The DAG for ProTECT can be viewed in Flowchart.txt.
 
-    :param job job: job
-    :param dict fastqs: Dict of lists of fastq files
-    :param univ_options: Universal Options
-    :param tool_options: Options for the various tools
-    :return: None
+    :param dict fastqs: Dict of lists of fastq files and the patient ID
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict tool_options: Options for the various tools
     """
     # Add Patient id to univ_options as is is passed to every major node in the DAG and can be used
     # as a prefix for the logfile.
     univ_options['patient'] = fastqs['patient_id']
@@ -391,90 +379,90 @@ def pipeline_launchpad(job, fastqs, univ_options, tool_options):
                               univ_options, tool_options['rankboost'], disk='100M',
                               memory='100M', cores=1)
     # Define the DAG in a static form
-    job.addChild(sample_prep)  # Edge  0->1
+    job.addChild(sample_prep)
 
     # A.
The first step is running the alignments and the MHC haplotypers - sample_prep.addChild(tumor_dna_fqs) # Edge 1->2 - sample_prep.addChild(normal_dna_fqs) # Edge 1->2 - sample_prep.addChild(tumor_rna_fqs) # Edge 1->2 + sample_prep.addChild(tumor_dna_fqs) + sample_prep.addChild(normal_dna_fqs) + sample_prep.addChild(tumor_rna_fqs) - tumor_rna_fqs.addChild(cutadapt) # Edge 1->2 - tumor_dna_fqs.addChild(bwa_tumor) # Edge 1->3 - normal_dna_fqs.addChild(bwa_normal) # Edge 1->4 + tumor_rna_fqs.addChild(cutadapt) + tumor_dna_fqs.addChild(bwa_tumor) + normal_dna_fqs.addChild(bwa_normal) - tumor_dna_fqs.addChild(phlat_tumor_dna) # Edge 1->5 - normal_dna_fqs.addChild(phlat_normal_dna) # Edge 1->6 - tumor_rna_fqs.addChild(phlat_tumor_rna) # Edge 1->7 + tumor_dna_fqs.addChild(phlat_tumor_dna) + normal_dna_fqs.addChild(phlat_normal_dna) + tumor_rna_fqs.addChild(phlat_tumor_rna) # B. cutadapt will be followed by star - cutadapt.addChild(star) # Edge 2->9 + cutadapt.addChild(star) # Ci. gene expression and fusion detection follow start alignment - star.addChild(rsem) # Edge 9->10 - star.addChild(fusions) # Edge 9->11 + star.addChild(rsem) + star.addChild(fusions) # Cii. Radia depends on all 3 alignments - star.addChild(radia) # Edge 9->12 - bwa_tumor.addChild(radia) # Edge 3->12 - bwa_normal.addChild(radia) # Edge 4->12 + star.addChild(radia) + bwa_tumor.addChild(radia) + bwa_normal.addChild(radia) # Ciii. mutect and indel calling depends on dna to have been aligned - bwa_tumor.addChild(mutect) # Edge 3->13 - bwa_normal.addChild(mutect) # Edge 4->13 - bwa_tumor.addChild(muse) # Edge 3->13 - bwa_normal.addChild(muse) # Edge 4->13 - bwa_tumor.addChild(somaticsniper) # Edge 3->13 - bwa_normal.addChild(somaticsniper) # Edge 4->13 - bwa_tumor.addChild(strelka) # Edge 3->13 - bwa_normal.addChild(strelka) # Edge 4->13 - bwa_tumor.addChild(indels) # Edge 3->14 - bwa_normal.addChild(indels) # Edge 4->14 + bwa_tumor.addChild(mutect) + bwa_normal.addChild(mutect) + bwa_tumor.addChild(muse) + bwa_normal.addChild(muse) + bwa_tumor.addChild(somaticsniper) + bwa_normal.addChild(somaticsniper) + bwa_tumor.addChild(strelka) + bwa_normal.addChild(strelka) + bwa_tumor.addChild(indels) + bwa_normal.addChild(indels) # D. MHC haplotypes will be merged once all 3 samples have been PHLAT-ed - phlat_tumor_dna.addChild(merge_phlat) # Edge 5->15 - phlat_normal_dna.addChild(merge_phlat) # Edge 6->15 - phlat_tumor_rna.addChild(merge_phlat) # Edge 7->15 + phlat_tumor_dna.addChild(merge_phlat) + phlat_normal_dna.addChild(merge_phlat) + phlat_tumor_rna.addChild(merge_phlat) # E. Delete the fastqs from the job store since all alignments are complete - sample_prep.addChild(fastq_deletion_1) # Edge 1->8 - cutadapt.addChild(fastq_deletion_1) # Edge 2->8 - bwa_normal.addChild(fastq_deletion_1) # Edge 3->8 - bwa_tumor.addChild(fastq_deletion_1) # Edge 4->8 - phlat_normal_dna.addChild(fastq_deletion_1) # Edge 5->8 - phlat_tumor_dna.addChild(fastq_deletion_1) # Edge 6>8 - phlat_tumor_rna.addChild(fastq_deletion_1) # Edge 7->8 + sample_prep.addChild(fastq_deletion_1) + cutadapt.addChild(fastq_deletion_1) + bwa_normal.addChild(fastq_deletion_1) + bwa_tumor.addChild(fastq_deletion_1) + phlat_normal_dna.addChild(fastq_deletion_1) + phlat_tumor_dna.addChild(fastq_deletion_1) + phlat_tumor_rna.addChild(fastq_deletion_1) star.addChild(fastq_deletion_2) # F. Mutation calls need to be merged before they can be used # G. 
     # G. All mutations get aggregated when they have finished running
-    fusions.addChild(merge_mutations)         # Edge 11->18
-    radia.addChild(merge_mutations)           # Edge 16->18
-    mutect.addChild(merge_mutations)          # Edge 17->18
-    muse.addChild(merge_mutations)            # Edge 17->18
-    somaticsniper.addChild(merge_mutations)   # Edge 17->18
-    strelka.addChild(merge_mutations)         # Edge 17->18
-    indels.addChild(merge_mutations)          # Edge 14->18
+    fusions.addChild(merge_mutations)
+    radia.addChild(merge_mutations)
+    mutect.addChild(merge_mutations)
+    muse.addChild(merge_mutations)
+    somaticsniper.addChild(merge_mutations)
+    strelka.addChild(merge_mutations)
+    indels.addChild(merge_mutations)
     # H. Aggregated mutations will be translated to protein space
-    merge_mutations.addChild(snpeff)          # Edge 18->19
+    merge_mutations.addChild(snpeff)
     # I. snpeffed mutations will be converted into peptides.
     # Transgene also accepts the RNA-seq bam and bai so that it can be rna-aware
-    snpeff.addChild(transgene)                # Edge 19->20
+    snpeff.addChild(transgene)
     star.addChild(transgene)
     # J. Merged haplotypes and peptides will be converted into jobs and submitted for mhc:peptide
     # binding prediction
-    merge_phlat.addChild(spawn_mhc)           # Edge 15->21
-    transgene.addChild(spawn_mhc)             # Edge 20->21
+    merge_phlat.addChild(spawn_mhc)
+    transgene.addChild(spawn_mhc)
     # K. The results from all the predictions will be merged. This is a follow-on job because
     # spawn_mhc will spawn an undetermined number of children.
-    spawn_mhc.addFollowOn(merge_mhc)          # Edges 21->XX->22 and 21->YY->22
+    spawn_mhc.addFollowOn(merge_mhc)
     # L. Finally, the merged mhc along with the gene expression will be used for rank boosting
-    rsem.addChild(rankboost)                  # Edge 10->23
-    merge_mhc.addChild(rankboost)             # Edge 22->23
+    rsem.addChild(rankboost)
+    merge_mhc.addChild(rankboost)
     # M. Assess the status of the MHC genes in the patient
-    phlat_tumor_rna.addChild(mhc_pathway_assessment)  # Edge 7->24
-    rsem.addChild(mhc_pathway_assessment)             # Edge 10->24
+    phlat_tumor_rna.addChild(mhc_pathway_assessment)
+    rsem.addChild(mhc_pathway_assessment)
     return None
 
 
 def get_all_tool_inputs(job, tools):
     """
-    This function will iterate through all the tool options and download the rquired file from their
-    remote locations.
+    Iterate through all the tool options and download required files from their remote locations.
 
     :param dict tools: A dict of dicts of all tools, and their options
-    :returns dict: The fully resolved tool dictionary
+    :return: The fully resolved tool dictionary
+    :rtype: dict
     """
     for tool in tools:
         for option in tools[tool]:
@@ -496,22 +484,23 @@ def get_all_tool_inputs(job, tools):
 
 def get_pipeline_inputs(job, input_flag, input_file, encryption_key=None,
                         per_file_encryption=False):
     """
-    Get the input file from s3 or disk, untargz if necessary and then write to file job store.
-    :param job: job
+    Get the input file from s3 or disk and write to file store.
+
+    :param str input_flag: The name of the flag
     :param str input_file: The value passed in the config file
-    :param str encryption_key: Path to the encryption key
+    :param str encryption_key: Path to the encryption key if encrypted with SSE-C
     :param bool per_file_encryption: If encrypted, was the file encrypted using the per-file
                                      method?
-    :return: The jobstore ID for the file
-    :rtype: str
+    :return: fsID for the file
+    :rtype: toil.fileStore.FileID
     """
     work_dir = os.getcwd()
     job.fileStore.logToMaster('Obtaining file (%s) to the file job store' %
                               os.path.basename(input_file))
     if input_file.startswith(('http', 'https', 'ftp')):
-        input_file=get_file_from_url(job, input_file,encryption_key=encryption_key,
-                                     per_file_encryption=per_file_encryption, write_to_jobstore=False)
-    elif input_file.startswith(('S3','s3')):
+        input_file = get_file_from_url(job, input_file, encryption_key=encryption_key,
+                                       per_file_encryption=per_file_encryption,
+                                       write_to_jobstore=False)
+    elif input_file.startswith(('S3', 's3')):
         input_file = get_file_from_s3(job, input_file, write_to_jobstore=False,
                                       encryption_key=encryption_key,
                                       per_file_encryption=per_file_encryption)
@@ -522,27 +511,28 @@ def get_pipeline_inputs(job, input_flag, input_file, encryption_key=None,
 
 def prepare_samples(job, fastqs, univ_options):
     """
-    This module will accept a dict object holding the 3 input prefixes and the patient id and will
-    attempt to store the fastqs to the jobstore. The input files must satisfy the following
-    1. File extensions can only be fastq or fq (.gz is also allowed)
-    2. Forward and reverse reads MUST be in the same folder with the same prefix
-    3. The files must be on the form
+    Obtain the 6 input fastqs and write them to the file store. The input files must satisfy the
+    following conditions:
+    1. Files must be of the form
        <prefix>1.<fastq/fq>[.gz]
       <prefix>2.<fastq/fq>[.gz]
-    The input dict is:
-    tumor_dna: prefix_to_tumor_dna
-    tumor_rna: prefix_to_tumor_rna
-    normal_dna: prefix_to_normal_dna
-    patient_id: patient ID
-
-    The input dict is updated to
-    tumor_dna: [jobStoreID_for_fastq1, jobStoreID_for_fastq2]
-    tumor_rna: [jobStoreID_for_fastq1, jobStoreID_for_fastq2]
-    normal_dna: [jobStoreID_for_fastq1, jobStoreID_for_fastq2]
-    patient_id: patient ID
-    gzipped: True/False
-
-    This module corresponds to node 1 in the tree
+    2. Forward and reverse reads MUST be in the same folder with the same prefix
+
+    :param dict fastqs: The input fastq dict
+           fastqs:
+               |- 'tumor_dna': str
+               |- 'tumor_rna': str
+               |- 'normal_dna': str
+               +- 'patient_id': str
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :return: Updated fastq dict
+           sample_fastqs:
+               |- 'tumor_dna': [fsID, fsID]
+               |- 'normal_dna': [fsID, fsID]
+               |- 'tumor_rna': [fsID, fsID]
+               |- 'gzipped': bool
+               +- 'patient_id': str
+    :rtype: dict
     """
     job.fileStore.logToMaster('Downloading Inputs for %s' % univ_options['patient'])
     # For each sample type, check if the prefix is an S3 link or a regular file
@@ -594,10 +584,10 @@ def get_fqs(job, fastqs, sample_type):
     """
     Convenience function to return only a list of tumor_rna, tumor_dna or normal_dna fqs
 
-    :param job: job
     :param dict fastqs: dict of list of fq files
     :param str sample_type: key in fastqs to return
     :return: fastqs[sample_type]
+    :rtype: list[toil.fileStore.FileID]
     """
     return fastqs[sample_type]
 
@@ -605,6 +595,7 @@ def generate_config_file():
     """
     Generate a config file for a ProTECT run on hg19.
+
     :return: None
     """
     shutil.copy(os.path.join(os.path.dirname(__file__), 'input_parameters.yaml'),
@@ -613,7 +604,7 @@
 
 def main():
     """
-    This is the main function for the UCSC Precision Immuno pipeline.
+    This is the main function for ProTECT.
""" parser = argparse.ArgumentParser(prog='ProTECT', description='Prediction of T-Cell Epitopes for Cancer Therapy', diff --git a/src/protect/qc/__init__.py b/src/protect/qc/__init__.py index c592ef89..6656c66f 100644 --- a/src/protect/qc/__init__.py +++ b/src/protect/qc/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import \ No newline at end of file +from __future__ import absolute_import diff --git a/src/protect/qc/rna.py b/src/protect/qc/rna.py index 0c8a76c4..68e67db4 100644 --- a/src/protect/qc/rna.py +++ b/src/protect/qc/rna.py @@ -14,6 +14,7 @@ # limitations under the License. from __future__ import print_function from math import ceil + from protect.common import docker_call, docker_path, get_files_from_filestore, is_gzipfile import os @@ -26,24 +27,13 @@ def cutadapt_disk(rna_fastqs): def run_cutadapt(job, fastqs, univ_options, cutadapt_options): """ - This module runs cutadapt on the input RNA fastq files and then calls the RNA aligners. - - ARGUMENTS - 1. fastqs: List of input RNA-Seq fastqs [ , ] - 2. univ_options: Dict of universal arguments used by almost all tools - univ_options - +- 'dockerhub': - 3. cutadapt_options: Dict of parameters specific to cutadapt - cutadapt_options - |- 'a': - +- 'A': - RETURN VALUES - 1. output_files: Dict of cutadapted fastqs - output_files - |- 'rna_cutadapt_1.fastq': - +- 'rna_cutadapt_2.fastq': + Runs cutadapt on the input RNA fastq files. - This module corresponds to node 2 on the tree + :param list fastqs: List of fsIDs for input an RNA-Seq fastq pair + :param dict univ_options: Dict of universal options used by almost all tools + :param dict cutadapt_options: Options specific to cutadapt + :return: List of fsIDs of cutadapted fastqs + :rtype: list[toil.fileStore.FileID] """ job.fileStore.logToMaster('Running cutadapt on %s' % univ_options['patient']) work_dir = os.getcwd() diff --git a/src/protect/rankboost.py b/src/protect/rankboost.py index d773ff02..c5e0473e 100644 --- a/src/protect/rankboost.py +++ b/src/protect/rankboost.py @@ -20,15 +20,20 @@ def wrap_rankboost(job, rsem_files, merged_mhc_calls, transgene_out, univ_options, rankboost_options): """ - This is a convenience function that runs rankboost on pipeline outputs. + A wrapper for boost_ranks. 
-    :param job job: job
-    :param dict rsem_files: dict of results from rsem
-    :param dict merged_mhc_calls: dict of results from merging mhc peptide binding predictions
-    :param dict transgene_out: dict of results from running transgene
-    :param dict univ_options: Universal Options
-    :param dict rankboost_options: Options specific to rank boost
-    :return:
+    :param dict rsem_files: Dict of results from rsem
+    :param dict merged_mhc_calls: Dict of results from merging mhc peptide binding predictions
+    :param dict transgene_out: Dict of results from running Transgene
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict rankboost_options: Options specific to rankboost
+    :return: Dict of concise and detailed results for mhci and mhcii
+        output_files:
+            |- 'mhcii_rankboost_concise_results.tsv': fsID
+            |- 'mhcii_rankboost_detailed_results.txt': fsID
+            |- 'mhci_rankboost_concise_results.tsv': fsID
+            +- 'mhci_rankboost_detailed_results.txt': fsID
+    :rtype: dict
     """
     rankboost = job.addChildJobFn(boost_ranks, rsem_files['rsem.isoforms.results'],
                                   merged_mhc_calls, transgene_out, univ_options, rankboost_options)
@@ -39,10 +44,20 @@ def wrap_rankboost(job, rsem_files, merged_mhc_calls, transgene_out, univ_option
 
 def boost_ranks(job, isoform_expression, merged_mhc_calls, transgene_out, univ_options,
                 rankboost_options):
     """
-    This is the final module in the pipeline. It will call the rank boosting R
-    script.
+    Boost the ranks of the predicted peptide:MHC combinations.
 
-    This module corresponds to node 21 in the tree
+    :param toil.fileStore.FileID isoform_expression: fsID for the rsem isoform expression file
+    :param dict merged_mhc_calls: Dict of results from merging mhc peptide binding predictions
+    :param dict transgene_out: Dict of results from running Transgene
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict rankboost_options: Options specific to rankboost
+    :return: Dict of concise and detailed results for mhci and mhcii
+        output_files:
+            |- 'mhcii_rankboost_concise_results.tsv': fsID
+            |- 'mhcii_rankboost_detailed_results.txt': fsID
+            |- 'mhci_rankboost_concise_results.tsv': fsID
+            +- 'mhci_rankboost_detailed_results.txt': fsID
+    :rtype: dict
     """
     job.fileStore.logToMaster('Running boost_ranks on %s' % univ_options['patient'])
     work_dir = os.getcwd()
diff --git a/src/protect/test/ci/test_protect.py b/src/protect/test/ci/test_protect.py
index ca7d97dc..0b4c6db4 100644
--- a/src/protect/test/ci/test_protect.py
+++ b/src/protect/test/ci/test_protect.py
@@ -61,7 +61,8 @@ def _test_ran_successfully(self):
                           [], ['normal_dna_fix_pg_sorted.bam', 'normal_dna_fix_pg_sorted.bam.bai',
                                'rna_fix_pg_sorted.bam', 'rna_fix_pg_sorted.bam.bai',
-                               'tumor_dna_fix_pg_sorted.bam', 'tumor_dna_fix_pg_sorted.bam.bai']),
+                               'rna_transcriptome.bam', 'tumor_dna_fix_pg_sorted.bam',
+                               'tumor_dna_fix_pg_sorted.bam.bai']),
                          ('/mnt/ephemeral/done/TEST/binding_predictions',
                           [], ['mhci_merged_files.list', 'mhcii_merged_files.list']),