Skip to content

Commit

Permalink
Merge pull request #170 from /issues/169-cleanup-code-base
Browse files Browse the repository at this point in the history
Cleanup codebase (resolves #169)
  • Loading branch information
arkal authored Mar 7, 2017
2 parents 32a4348 + 537648d commit 156844e
Show file tree
Hide file tree
Showing 30 changed files with 915 additions and 975 deletions.
28 changes: 18 additions & 10 deletions src/protect/addons.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

from collections import Counter

from protect.common import export_results, get_files_from_filestore, untargz
from protect.haplotyping.phlat import parse_phlat_file

Expand All @@ -23,26 +25,30 @@

def run_mhc_gene_assessment(job, rsem_files, rna_haplotype, univ_options, mhc_genes_options):
"""
This is a convenience function that runs assess_mhc_genes.
:param job job:
A wrapper for assess_mhc_genes.
:param dict rsem_files: Results from running rsem
:param str rna_haplotype: The job store id for the rna haplotype file
:param dict univ_options: Universal Options
:param dict univ_options: Dict of universal options used by almost all tools
:param dict mhc_genes_options: Options specific to assessing the MHC genes
:return: The results of running assess_mhc_genes
:rtype: toil.fileStore.FileID
"""
return job.addChildJobFn(assess_mhc_genes, rsem_files['rsem.isoforms.results'], rna_haplotype,
univ_options, mhc_genes_options).rv()


def assess_mhc_genes(job, isoform_expression, rna_haplotype, univ_options, mhc_genes_options):
"""
This module will assess the prevalence of the various genes in the MHC pathway and return a
report in the tsv format
:param isoform_expression: Isoform expression from run_rsem
:param rna_haplotype: PHLAT output from running on rna
:param univ_options: Universal options for the pipeline
:param mhc_genes_options: options specific to this module
Assess the prevalence of the various genes in the MHC pathway and return a report in the tsv
format.
:param toil.fileStore.FileID isoform_expression: fsID for the rsem isoform expression file
:param toil.fileStore.FileID rna_haplotype: fsID for the RNA PHLAT file
:param dict univ_options: Dict of universal options used by almost all tools
:param dict mhc_genes_options: Options specific to assessing the MHC genes
:return: The fsID for the mhc pathway report file
:rtype: toil.fileStore.FileID
"""
job.fileStore.logToMaster('Running mhc gene assessment on %s' % univ_options['patient'])
work_dir = os.getcwd()
Expand Down Expand Up @@ -91,7 +97,9 @@ def assess_mhc_genes(job, isoform_expression, rna_haplotype, univ_options, mhc_g
for mhcii_allele in ('HLA_DQA', 'HLA_DQB', 'HLA_DRA', 'HLA_DRB'):
if mhcii_allele != 'HLA_DRA':
num_alleles = len(mhc_alleles[mhcii_allele])
result = 'FAIL' if num_alleles == 0 else 'LOW' if num_alleles == 1 else 'PASS'
result = ('FAIL' if num_alleles == 0 else
'LOW' if num_alleles == 1 else
'PASS')
print("{:12}{:<12}{:<12}{:12}".format(mhcii_allele, 2, num_alleles, result),
file=mpr)
else:
Expand Down
19 changes: 10 additions & 9 deletions src/protect/alignment/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# limitations under the License.
from __future__ import absolute_import
from math import ceil
from protect.common import docker_call, get_files_from_filestore, export_results
from protect.common import docker_call, export_results, get_files_from_filestore

import os

Expand All @@ -28,14 +28,15 @@ def index_bamfile(job, bamfile, sample_type, univ_options, samtools_options):
"""
This module indexes BAMFILE
ARGUMENTS
1. bamfile: <JSid for a bam file>
2. sample_type: string of 'tumor_dna' or 'normal_dna'
3. univ_options: Dict of universal arguments used by almost all tools
univ_options
+- 'dockerhub': <dockerhub to use>
RETURN VALUES
1. output_files: REFER output_files in run_bwa(). This module is the one
that generates the files.
:param toil.fileStore.FileID bamfile: fsID for the bam file
:param str sample_type: Description of the sample to inject into the filename
:param dict univ_options: Dict of universal options used by almost all tools
:param dict samtools_options: Options specific to samtools
:return: Dict containing input bam and the generated index (.bam.bai)
output_files:
|- '<sample_type>_fix_pg_sorted.bam': fsID
+- '<sample_type>_fix_pg_sorted.bam.bai': fsID
:rtype: dict
"""
job.fileStore.logToMaster('Running samtools-index on %s:%s' % (univ_options['patient'],
sample_type))
Expand Down
100 changes: 46 additions & 54 deletions src/protect/alignment/dna.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
from __future__ import absolute_import, print_function
from math import ceil

from protect.alignment.common import index_bamfile, index_disk
from protect.common import docker_call, docker_path, get_files_from_filestore, is_gzipfile, untargz
from toil.job import PromisedRequirement
Expand Down Expand Up @@ -41,7 +42,17 @@ def regroup_disk(reheader_bam):

def align_dna(job, fastqs, sample_type, univ_options, bwa_options):
"""
This is a convenience function that runs the entire dna alignment subgraph
A wrapper for the entire dna alignment subgraph.
:param list fastqs: The input fastqs for alignment
:param str sample_type: Description of the sample to inject into the filename
:param dict univ_options: Dict of universal options used by almost all tools
:param dict bwa_options: Options specific to bwa
:return: Dict containing output bam and bai
output_files:
|- '<sample_type>_fix_pg_sorted.bam': fsID
+- '<sample_type>_fix_pg_sorted.bam.bai': fsID
:rtype: dict
"""
bwa = job.wrapJobFn(run_bwa, fastqs, sample_type, univ_options, bwa_options,
disk=PromisedRequirement(bwa_disk, fastqs, bwa_options['index']),
Expand Down Expand Up @@ -69,28 +80,14 @@ def align_dna(job, fastqs, sample_type, univ_options, bwa_options):

def run_bwa(job, fastqs, sample_type, univ_options, bwa_options):
"""
This module aligns the SAMPLE_TYPE dna fastqs to the reference
ARGUMENTS -- <ST> depicts the sample type. Substitute with 'tumor'/'normal'
1. fastqs: Dict of list of input WGS/WXS fastqs
fastqs
+- '<ST>_dna': [<JSid for 1.fastq> , <JSid for 2.fastq>]
2. sample_type: string of 'tumor_dna' or 'normal_dna'
3. univ_options: Dict of universal arguments used by almost all tools
univ_options
+- 'dockerhub': <dockerhub to use>
4. bwa_options: Dict of parameters specific to bwa
bwa_options
|- 'index': <JSid for the bwa index tarball>
+- 'n': <number of threads to allocate>
RETURN VALUES
1. output_files: Dict of aligned bam + reference (nested return)
output_files
|- '<ST>_fix_pg_sorted.bam': <JSid>
+- '<ST>_fix_pg_sorted.bam.bai': <JSid>
This module corresponds to nodes 3 and 4 on the tree
Align a pair of fastqs with bwa.
:param list fastqs: The input fastqs for alignment
:param str sample_type: Description of the sample to inject into the filename
:param dict univ_options: Dict of universal options used by almost all tools
:param dict bwa_options: Options specific to bwa
:return: fsID for the generated sam
:rtype: toil.fileStore.FileID
"""
job.fileStore.logToMaster('Running bwa on %s:%s' % (univ_options['patient'], sample_type))
work_dir = os.getcwd()
Expand Down Expand Up @@ -126,16 +123,14 @@ def run_bwa(job, fastqs, sample_type, univ_options, bwa_options):

def bam_conversion(job, samfile, sample_type, univ_options, samtools_options):
"""
This module converts SAMFILE from sam to bam
ARGUMENTS
1. samfile: <JSid for a sam file>
2. sample_type: string of 'tumor_dna' or 'normal_dna'
3. univ_options: Dict of universal arguments used by almost all tools
univ_options
+- 'dockerhub': <dockerhub to use>
RETURN VALUES
1. output_files: REFER output_files in run_bwa()
Convert a sam to a bam.
:param dict samfile: The input sam file
:param str sample_type: Description of the sample to inject into the filename
:param dict univ_options: Dict of universal options used by almost all tools
:param dict samtools_options: Options specific to samtools
:return: fsID for the generated bam
:rtype: toil.fileStore.FileID
"""
job.fileStore.logToMaster('Running sam2bam on %s:%s' % (univ_options['patient'], sample_type))
work_dir = os.getcwd()
Expand All @@ -159,16 +154,15 @@ def bam_conversion(job, samfile, sample_type, univ_options, samtools_options):

def fix_bam_header(job, bamfile, sample_type, univ_options, samtools_options):
"""
This module modified the header in BAMFILE
ARGUMENTS
1. bamfile: <JSid for a bam file>
2. sample_type: string of 'tumor_dna' or 'normal_dna'
3. univ_options: Dict of universal arguments used by almost all tools
univ_options
+- 'dockerhub': <dockerhub to use>
RETURN VALUES
1. output_files: REFER output_files in run_bwa()
Fix the bam header to remove the command line call. Failing to do this causes Picard to reject
the bam.
:param dict bamfile: The input bam file
:param str sample_type: Description of the sample to inject into the filename
:param dict univ_options: Dict of universal options used by almost all tools
:param dict samtools_options: Options specific to samtools
:return: fsID for the output bam
:rtype: toil.fileStore.FileID
"""
job.fileStore.logToMaster('Running reheader on %s:%s' % (univ_options['patient'], sample_type))
work_dir = os.getcwd()
Expand Down Expand Up @@ -203,16 +197,14 @@ def fix_bam_header(job, bamfile, sample_type, univ_options, samtools_options):

def add_readgroups(job, bamfile, sample_type, univ_options, picard_options):
"""
This module adds the appropriate read groups to the bam file
ARGUMENTS
1. bamfile: <JSid for a bam file>
2. sample_type: string of 'tumor_dna' or 'normal_dna'
3. univ_options: Dict of universal arguments used by almost all tools
univ_options
|- 'dockerhub': <dockerhub to use>
+- 'java_Xmx': value for max heap passed to java
RETURN VALUES
1. output_files: REFER output_files in run_bwa()
Add read groups to the bam.
:param dict bamfile: The input bam file
:param str sample_type: Description of the sample to inject into the filename
:param dict univ_options: Dict of universal options used by almost all tools
:param dict picard_options: Options specific to picard
:return: fsID for the output bam
:rtype: toil.fileStore.FileID
"""
job.fileStore.logToMaster('Running add_read_groups on %s:%s' % (univ_options['patient'],
sample_type))
Expand All @@ -231,7 +223,7 @@ def add_readgroups(job, bamfile, sample_type, univ_options, picard_options):
'PU=12345',
''.join(['SM=', sample_type.rstrip('_dna')])]
docker_call(tool='picard', tool_parameters=parameters, work_dir=work_dir,
dockerhub=univ_options['dockerhub'], java_opts=univ_options['java_Xmx'],
dockerhub=univ_options['dockerhub'], java_xmx=univ_options['java_Xmx'],
tool_version=picard_options['version'])
output_file = job.fileStore.writeGlobalFile(
'/'.join([work_dir, sample_type + '_aligned_fixpg_sorted_reheader.bam']))
Expand Down
61 changes: 39 additions & 22 deletions src/protect/alignment/rna.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,14 @@
from __future__ import absolute_import, print_function
from collections import defaultdict
from math import ceil

from protect.alignment.common import index_bamfile, index_disk
from protect.common import docker_call, get_files_from_filestore, is_gzipfile, untargz, docker_path
from protect.common import (docker_call,
docker_path,
export_results,
get_files_from_filestore,
is_gzipfile,
untargz)
from toil.job import PromisedRequirement

import os
Expand All @@ -31,7 +37,13 @@ def star_disk(rna_fastqs, star_tar):

def align_rna(job, fastqs, univ_options, star_options):
"""
This is a convenience function that runs the entire rna alignment subgraph
A wrapper for the entire rna alignment subgraph.
:param list fastqs: The input fastqs for alignment
:param dict univ_options: Dict of universal options used by almost all tools
:param dict star_options: Options specific to star
:return: Dict containing input bam and the generated index (.bam.bai)
:rtype: dict
"""
star = job.wrapJobFn(run_star, fastqs, univ_options, star_options,
cores=star_options['n'],
Expand All @@ -47,26 +59,17 @@ def align_rna(job, fastqs, univ_options, star_options):

def run_star(job, fastqs, univ_options, star_options):
"""
This module uses STAR to align the RNA fastqs to the reference
ARGUMENTS
1. fastqs: REFER RETURN VALUE of run_cutadapt()
2. univ_options: Dict of universal arguments used by almost all tools
univ_options
+- 'dockerhub': <dockerhub to use>
3. star_options: Dict of parameters specific to STAR
star_options
|- 'index': <JSid for the STAR index tarball>
+- 'n': <number of threads to allocate>
RETURN VALUES
1. output_files: Dict of aligned bams
output_files
|- 'rnaAligned.toTranscriptome.out.bam': <JSid>
+- 'rnaAligned.sortedByCoord.out.bam': Dict of genome bam + bai
|- 'rna_fix_pg_sorted.bam': <JSid>
+- 'rna_fix_pg_sorted.bam.bai': <JSid>
Align a pair of fastqs with STAR.
This module corresponds to node 9 on the tree
:param list fastqs: The input fastqs for alignment
:param dict univ_options: Dict of universal options used by almost all tools
:param dict star_options: Options specific to star
:return: Dict containing output genome bam, genome bai, and transcriptome bam
output_files:
|- 'rnaAligned.toTranscriptome.out.bam': fsID
+- 'rnaAligned.sortedByCoord.out.bam':
+- 'rna_fix_pg_sorted.bam': fsID
:rtype: dict
"""
assert star_options['type'] in ('star', 'starlong')
job.fileStore.logToMaster('Running STAR on %s' % univ_options['patient'])
Expand Down Expand Up @@ -110,13 +113,27 @@ def run_star(job, fastqs, univ_options, star_options):
'rnaAligned.sortedByCoord.out.bam']:
output_files[bam_file] = job.fileStore.writeGlobalFile('/'.join([
work_dir, bam_file]))
export_results(job, output_files['rnaAligned.toTranscriptome.out.bam'], 'rna_transcriptome.bam',
univ_options, subfolder='alignments')
return output_files


def index_star(job, star_bams, univ_options, star_options):
"""
This is a wrapper function for index_bamfile in protect.common which is required since run_star
A wrapper for indexing the genomic star bam generated by run_star. It is required since run_star
returns a dict of 2 bams
:param dict star_bams: The bams from run_star
:param dict univ_options: Dict of universal options used by almost all tools
:param dict star_options: Options specific to star
:return: Dict containing input bam and the generated index (.bam.bai)
output_files:
|- 'rnaAligned.toTranscriptome.out.bam': fsID
+- 'rnaAligned.sortedByCoord.out.bam':
|- 'rna_fix_pg_sorted.bam': fsID
+- 'rna_fix_pg_sorted.bam.bai': fsID
:rtype: dict
"""
index = job.wrapJobFn(index_bamfile, star_bams['rnaAligned.sortedByCoord.out.bam'], 'rna',
univ_options, samtools_options=star_options['samtools'],
Expand Down
2 changes: 1 addition & 1 deletion src/protect/binding_prediction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import absolute_import
Loading

0 comments on commit 156844e

Please sign in to comment.