Merge pull request #170 from /issues/169-cleanup-code-base

Cleanup codebase (resolves #169)
BD2KGenomics · Mar 7, 2017 · 156844e · 156844e
2 parents 32a4348 + 537648d
commit 156844e
Show file tree

Hide file tree

Showing 30 changed files with 915 additions and 975 deletions.
diff --git a/src/protect/addons.py b/src/protect/addons.py
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
+
 from collections import Counter
+
 from protect.common import export_results, get_files_from_filestore, untargz
 from protect.haplotyping.phlat import parse_phlat_file
 
@@ -23,26 +25,30 @@
 
 def run_mhc_gene_assessment(job, rsem_files, rna_haplotype, univ_options, mhc_genes_options):
     """
-    This is a convenience function that runs assess_mhc_genes.
-    :param job job:
+    A wrapper for assess_mhc_genes.
+
     :param dict rsem_files: Results from running rsem
     :param str rna_haplotype: The job store id for the rna haplotype file
-    :param dict univ_options: Universal Options
+    :param dict univ_options: Dict of universal options used by almost all tools
     :param dict mhc_genes_options: Options specific to assessing the MHC genes
     :return: The results of running assess_mhc_genes
+    :rtype: toil.fileStore.FileID
     """
     return job.addChildJobFn(assess_mhc_genes, rsem_files['rsem.isoforms.results'], rna_haplotype,
                              univ_options, mhc_genes_options).rv()
 
 
 def assess_mhc_genes(job, isoform_expression, rna_haplotype, univ_options, mhc_genes_options):
     """
-    This module will assess the prevalence of the various genes in the MHC pathway and return a
-    report in the tsv format
-    :param isoform_expression: Isoform expression from run_rsem
-    :param rna_haplotype: PHLAT output from running on rna
-    :param univ_options: Universal options for the pipeline
-    :param mhc_genes_options: options specific to this module
+    Assess the prevalence of the various genes in the MHC pathway and return a report in the tsv
+    format.
+
+    :param toil.fileStore.FileID isoform_expression: fsID for the rsem isoform expression file
+    :param toil.fileStore.FileID rna_haplotype: fsID for the RNA PHLAT file
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict mhc_genes_options: Options specific to assessing the MHC genes
+    :return: The fsID for the mhc pathway report file
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running mhc gene assessment on %s' % univ_options['patient'])
     work_dir = os.getcwd()
@@ -91,7 +97,9 @@ def assess_mhc_genes(job, isoform_expression, rna_haplotype, univ_options, mhc_g
                 for mhcii_allele in ('HLA_DQA', 'HLA_DQB', 'HLA_DRA', 'HLA_DRB'):
                     if mhcii_allele != 'HLA_DRA':
                         num_alleles = len(mhc_alleles[mhcii_allele])
-                        result = 'FAIL' if num_alleles == 0 else 'LOW' if num_alleles == 1 else 'PASS'
+                        result = ('FAIL' if num_alleles == 0 else
+                                  'LOW' if num_alleles == 1 else
+                                  'PASS')
                         print("{:12}{:<12}{:<12}{:12}".format(mhcii_allele, 2, num_alleles, result),
                               file=mpr)
                     else:

diff --git a/src/protect/alignment/common.py b/src/protect/alignment/common.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 from __future__ import absolute_import
 from math import ceil
-from protect.common import docker_call, get_files_from_filestore, export_results
+from protect.common import docker_call,  export_results, get_files_from_filestore
 
 import os
 
@@ -28,14 +28,15 @@ def index_bamfile(job, bamfile, sample_type, univ_options, samtools_options):
     """
     Index BAMFILE using samtools.
 
-    1. bamfile: <JSid for a bam file>
-    2. sample_type: string of 'tumor_dna' or 'normal_dna'
-    3. univ_options: Dict of universal arguments used by almost all tools
-         univ_options
-                +- 'dockerhub': <dockerhub to use>
-    RETURN VALUES
-    1. output_files: REFER output_files in run_bwa(). This module is the one is
-                     the one that generates the files.
+    :param toil.fileStore.FileID bamfile: fsID for the bam file
+    :param str sample_type: Description of the sample to inject into the filename
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict samtools_options: Options specific to samtools
+    :return: Dict containing input bam and the generated index (.bam.bai)
+             output_files:
+                 |- '<sample_type>_fix_pg_sorted.bam': fsID
+                 +- '<sample_type>_fix_pg_sorted.bam.bai': fsID
+    :rtype: dict
     """
     job.fileStore.logToMaster('Running samtools-index on %s:%s' % (univ_options['patient'],
                                                                    sample_type))

diff --git a/src/protect/alignment/dna.py b/src/protect/alignment/dna.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 from __future__ import absolute_import, print_function
 from math import ceil
+
 from protect.alignment.common import index_bamfile, index_disk
 from protect.common import docker_call, docker_path, get_files_from_filestore, is_gzipfile, untargz
 from toil.job import PromisedRequirement
@@ -41,7 +42,17 @@ def regroup_disk(reheader_bam):
 
 def align_dna(job, fastqs, sample_type, univ_options, bwa_options):
     """
-    This is a convenience function that runs the entire dna alignment subgraph
+    A wrapper for the entire dna alignment subgraph.
+
+    :param list fastqs: The input fastqs for alignment
+    :param str sample_type: Description of the sample to inject into the filename
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict bwa_options: Options specific to bwa
+    :return: Dict containing output bam and bai
+             output_files:
+                 |- '<sample_type>_fix_pg_sorted.bam': fsID
+                 +- '<sample_type>_fix_pg_sorted.bam.bai': fsID
+    :rtype: dict
     """
     bwa = job.wrapJobFn(run_bwa, fastqs, sample_type, univ_options, bwa_options,
                         disk=PromisedRequirement(bwa_disk, fastqs, bwa_options['index']),
@@ -69,28 +80,14 @@ def align_dna(job, fastqs, sample_type, univ_options, bwa_options):
 
 def run_bwa(job, fastqs, sample_type, univ_options, bwa_options):
     """
-    This module aligns the SAMPLE_TYPE dna fastqs to the reference
-
-    ARGUMENTS -- <ST> depicts the sample type. Substitute with 'tumor'/'normal'
-    1. fastqs: Dict of list of input WGS/WXS fastqs
-         fastqs
-              +- '<ST>_dna': [<JSid for 1.fastq> , <JSid for 2.fastq>]
-    2. sample_type: string of 'tumor_dna' or 'normal_dna'
-    3. univ_options: Dict of universal arguments used by almost all tools
-         univ_options
-                +- 'dockerhub': <dockerhub to use>
-    4. bwa_options: Dict of parameters specific to bwa
-         bwa_options
-              |- 'index': <JSid for the bwa index tarball>
-              +- 'n': <number of threads to allocate>
-
-    RETURN VALUES
-    1. output_files: Dict of aligned bam + reference (nested return)
-         output_files
-             |- '<ST>_fix_pg_sorted.bam': <JSid>
-             +- '<ST>_fix_pg_sorted.bam.bai': <JSid>
-
-    This module corresponds to nodes 3 and 4 on the tree
+    Align a pair of fastqs with bwa.
+
+    :param list fastqs: The input fastqs for alignment
+    :param str sample_type: Description of the sample to inject into the filename
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict bwa_options: Options specific to bwa
+    :return: fsID for the generated sam
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running bwa on %s:%s' % (univ_options['patient'], sample_type))
     work_dir = os.getcwd()
@@ -126,16 +123,14 @@ def run_bwa(job, fastqs, sample_type, univ_options, bwa_options):
 
 def bam_conversion(job, samfile, sample_type, univ_options, samtools_options):
     """
-    This module converts SAMFILE from sam to bam
-
-    ARGUMENTS
-    1. samfile: <JSid for a sam file>
-    2. sample_type: string of 'tumor_dna' or 'normal_dna'
-    3. univ_options: Dict of universal arguments used by almost all tools
-         univ_options
-                +- 'dockerhub': <dockerhub to use>
-    RETURN VALUES
-    1. output_files: REFER output_files in run_bwa()
+    Convert a sam to a bam.
+
+    :param toil.fileStore.FileID samfile: fsID for the input sam file
+    :param str sample_type: Description of the sample to inject into the filename
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict samtools_options: Options specific to samtools
+    :return: fsID for the generated bam
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running sam2bam on %s:%s' % (univ_options['patient'], sample_type))
     work_dir = os.getcwd()
@@ -159,16 +154,15 @@ def bam_conversion(job, samfile, sample_type, univ_options, samtools_options):
 
 def fix_bam_header(job, bamfile, sample_type, univ_options, samtools_options):
     """
-    This module modified the header in BAMFILE
-
-    ARGUMENTS
-    1. bamfile: <JSid for a bam file>
-    2. sample_type: string of 'tumor_dna' or 'normal_dna'
-    3. univ_options: Dict of universal arguments used by almost all tools
-         univ_options
-                +- 'dockerhub': <dockerhub to use>
-    RETURN VALUES
-    1. output_files: REFER output_files in run_bwa()
+    Fix the bam header to remove the command line call.  Failing to do this causes Picard to reject
+    the bam.
+
+    :param toil.fileStore.FileID bamfile: fsID for the input bam file
+    :param str sample_type: Description of the sample to inject into the filename
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict samtools_options: Options specific to samtools
+    :return: fsID for the output bam
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running reheader on %s:%s' % (univ_options['patient'], sample_type))
     work_dir = os.getcwd()
@@ -203,16 +197,14 @@ def fix_bam_header(job, bamfile, sample_type, univ_options, samtools_options):
 
 def add_readgroups(job, bamfile, sample_type, univ_options, picard_options):
     """
-    This module adds the appropriate read groups to the bam file
-    ARGUMENTS
-    1. bamfile: <JSid for a bam file>
-    2. sample_type: string of 'tumor_dna' or 'normal_dna'
-    3. univ_options: Dict of universal arguments used by almost all tools
-         univ_options
-                |- 'dockerhub': <dockerhub to use>
-                +- 'java_Xmx': value for max heap passed to java
-    RETURN VALUES
-    1. output_files: REFER output_files in run_bwa()
+    Add read groups to the bam.
+
+    :param toil.fileStore.FileID bamfile: fsID for the input bam file
+    :param str sample_type: Description of the sample to inject into the filename
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict picard_options: Options specific to picard
+    :return: fsID for the output bam
+    :rtype: toil.fileStore.FileID
     """
     job.fileStore.logToMaster('Running add_read_groups on %s:%s' % (univ_options['patient'],
                                                                     sample_type))
@@ -231,7 +223,7 @@ def add_readgroups(job, bamfile, sample_type, univ_options, picard_options):
                   'PU=12345',
                   ''.join(['SM=', sample_type.rstrip('_dna')])]
     docker_call(tool='picard', tool_parameters=parameters, work_dir=work_dir,
-                dockerhub=univ_options['dockerhub'], java_opts=univ_options['java_Xmx'],
+                dockerhub=univ_options['dockerhub'], java_xmx=univ_options['java_Xmx'],
                 tool_version=picard_options['version'])
     output_file = job.fileStore.writeGlobalFile(
         '/'.join([work_dir, sample_type + '_aligned_fixpg_sorted_reheader.bam']))

diff --git a/src/protect/alignment/rna.py b/src/protect/alignment/rna.py
@@ -15,8 +15,14 @@
 from __future__ import absolute_import, print_function
 from collections import defaultdict
 from math import ceil
+
 from protect.alignment.common import index_bamfile, index_disk
-from protect.common import docker_call, get_files_from_filestore, is_gzipfile, untargz, docker_path
+from protect.common import (docker_call,
+                            docker_path,
+                            export_results,
+                            get_files_from_filestore,
+                            is_gzipfile,
+                            untargz)
 from toil.job import PromisedRequirement
 
 import os
@@ -31,7 +37,13 @@ def star_disk(rna_fastqs, star_tar):
 
 def align_rna(job, fastqs, univ_options, star_options):
     """
-    This is a convenience function that runs the entire rna alignment subgraph
+    A wrapper for the entire rna alignment subgraph.
+
+    :param list fastqs: The input fastqs for alignment
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict star_options: Options specific to star
+    :return: Dict containing input bam and the generated index (.bam.bai)
+    :rtype: dict
     """
     star = job.wrapJobFn(run_star, fastqs, univ_options, star_options,
                          cores=star_options['n'],
@@ -47,26 +59,17 @@ def align_rna(job, fastqs, univ_options, star_options):
 
 def run_star(job, fastqs, univ_options, star_options):
     """
-    This module uses STAR to align the RNA fastqs to the reference
-
-    ARGUMENTS
-    1. fastqs: REFER RETURN VALUE of run_cutadapt()
-    2. univ_options: Dict of universal arguments used by almost all tools
-         univ_options
-              +- 'dockerhub': <dockerhub to use>
-    3. star_options: Dict of parameters specific to STAR
-         star_options
-             |- 'index': <JSid for the STAR index tarball>
-             +- 'n': <number of threads to allocate>
-    RETURN VALUES
-    1. output_files: Dict of aligned bams
-         output_files
-             |- 'rnaAligned.toTranscriptome.out.bam': <JSid>
-             +- 'rnaAligned.sortedByCoord.out.bam': Dict of genome bam + bai
-                                |- 'rna_fix_pg_sorted.bam': <JSid>
-                                +- 'rna_fix_pg_sorted.bam.bai': <JSid>
+    Align a pair of fastqs with STAR.
 
-    This module corresponds to node 9 on the tree
+    :param list fastqs: The input fastqs for alignment
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict star_options: Options specific to star
+    :return: Dict containing output genome bam, genome bai, and transcriptome bam
+                 output_files:
+                    |- 'rnaAligned.toTranscriptome.out.bam': fsID
+                    +- 'rnaAligned.sortedByCoord.out.bam':
+                                        +- 'rna_fix_pg_sorted.bam': fsID
+    :rtype: dict
     """
     assert star_options['type'] in ('star', 'starlong')
     job.fileStore.logToMaster('Running STAR on %s' % univ_options['patient'])
@@ -110,13 +113,27 @@ def run_star(job, fastqs, univ_options, star_options):
                      'rnaAligned.sortedByCoord.out.bam']:
         output_files[bam_file] = job.fileStore.writeGlobalFile('/'.join([
             work_dir, bam_file]))
+    export_results(job, output_files['rnaAligned.toTranscriptome.out.bam'], 'rna_transcriptome.bam',
+                   univ_options, subfolder='alignments')
     return output_files
 
 
 def index_star(job, star_bams, univ_options, star_options):
     """
-    This is a wrapper functiion for index_bamfile in protect.common which is required since run_star
+    A wrapper for indexing the genomic star bam generated by run_star. It is required since run_star
     returns a dict of 2 bams.
+
+    :param dict star_bams: The bams from run_star
+    :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict star_options: Options specific to star
+    :return: Dict containing input bam and the generated index (.bam.bai)
+                     output_files:
+                        |- 'rnaAligned.toTranscriptome.out.bam': fsID
+                        +- 'rnaAligned.sortedByCoord.out.bam':
+                                        |- 'rna_fix_pg_sorted.bam': fsID
+                                        +- 'rna_fix_pg_sorted.bam.bai': fsID
+
+    :rtype: dict
     """
     index = job.wrapJobFn(index_bamfile, star_bams['rnaAligned.sortedByCoord.out.bam'], 'rna',
                           univ_options, samtools_options=star_options['samtools'],

diff --git a/src/protect/binding_prediction/__init__.py b/src/protect/binding_prediction/__init__.py
@@ -12,4 +12,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
+from __future__ import absolute_import