diff --git a/MANUAL.md b/MANUAL.md index 146308a5..3442ba99 100644 --- a/MANUAL.md +++ b/MANUAL.md @@ -186,6 +186,8 @@ These describe options that are used universally by most tools/jobs in the workf java_Xmx: 20G -> The default Java heap space to be provided to tools. Per-tool heap space can be specified for some tools to override this value. + reference_build: hg19 -> The reference build used in this run. Can be hg19, + hg38, GRCh37 or GRCh38. sse_key: /path/to/master.key -> Used to create per-file SSE-C keys for decrypting S3 hosted input files. It is highly recommended that the files be uploaded to S3 using s3am using the @@ -275,8 +277,14 @@ be substituted with S3 links. Descriptions for creating all files can be found i version: 1.1.7 muse: version: 1.0rc_submission_b391201 - radia: - version: 398366ef07b5911d8082ed61cbf03d487a41f286 + radia: -> Radia uses perchrom bed files in + folders as references. + cosmic_beds: /path/to/radia_cosmic.tar.gz + dbsnp_beds: /path/to/radia_dbsnp.tar.gz + retrogene_beds: /path/to/radia_retrogenes.tar.gz + pseudogene_beds: /path/to/radia_pseudogenes.tar.gz + gencode_beds: /path/to/radia_gencode.tar.gz + version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03 somaticsniper: version: 1.0.4 samtools: -> pileup reads @@ -308,7 +316,7 @@ be substituted with S3 links. 
Descriptions for creating all files can be found i file must be made to follow the gencode format for fasta record names - version: 2.1.0 + version: 2.1.1 haplotyping: phlat: diff --git a/required_docker_tools.txt b/required_docker_tools.txt index 3e631cb4..3215dfeb 100644 --- a/required_docker_tools.txt +++ b/required_docker_tools.txt @@ -7,7 +7,7 @@ # Alignment cutadapt:1.9.1 bwa:0.7.9a - star:2.4.2a + star:2.5.2b starlong:2.4.2a # Alignment post @@ -22,8 +22,8 @@ # Mutation Calling mutect:1.1.7 - radia:398366ef07b5911d8082ed61cbf03d487a41f286 - filterradia:398366ef07b5911d8082ed61cbf03d487a41f286 + radia:bcda721fc1f9c28d8b9224c2f95c440759cd3a03 + filterradia:bcda721fc1f9c28d8b9224c2f95c440759cd3a03 muse:1.0rc_submission_b391201 somaticsniper:1.0.4 somaticsniper-addons:1.0.4 @@ -35,7 +35,7 @@ snpeff:3.6 # Mutation Translation - transgene:2.1.0 + transgene:2.1.1 # MHC:peptide binding prediction mhci:2.13 diff --git a/src/protect/alignment/dna.py b/src/protect/alignment/dna.py index fa4352d9..586d3578 100644 --- a/src/protect/alignment/dna.py +++ b/src/protect/alignment/dna.py @@ -112,7 +112,7 @@ def run_bwa(job, fastqs, sample_type, univ_options, bwa_options): parameters = ['mem', '-t', str(bwa_options['n']), '-v', '1', # Don't print INFO messages to the stderr - '/'.join([input_files['bwa_index'], 'hg19']), + '/'.join([input_files['bwa_index'], univ_options['ref']]), input_files['dna_1.fastq' + gz], input_files['dna_2.fastq' + gz]] with open(''.join([work_dir, '/', sample_type, '_aligned.sam']), 'w') as samfile: diff --git a/src/protect/expression_profiling/rsem.py b/src/protect/expression_profiling/rsem.py index 4b83fd63..b2ce9279 100644 --- a/src/protect/expression_profiling/rsem.py +++ b/src/protect/expression_profiling/rsem.py @@ -80,7 +80,7 @@ def run_rsem(job, rna_bam, univ_options, rsem_options): '--bam', input_files['star_transcriptome.bam'], '--no-bam-output', - '/'.join([input_files['rsem_index'], 'hg19']), + '/'.join([input_files['rsem_index'], 
univ_options['ref']]), 'rsem'] docker_call(tool='rsem', tool_parameters=parameters, work_dir=work_dir, dockerhub=univ_options['dockerhub'], tool_version=rsem_options['version']) diff --git a/src/protect/mutation_annotation/snpeff.py b/src/protect/mutation_annotation/snpeff.py index eafdecdf..151fdbd2 100644 --- a/src/protect/mutation_annotation/snpeff.py +++ b/src/protect/mutation_annotation/snpeff.py @@ -56,13 +56,14 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options): parameters = ['eff', '-dataDir', input_files['snpeff_index'], - '-c', '/'.join([input_files['snpeff_index'], 'snpEff_hg19_gencode.config']), + '-c', '/'.join([input_files['snpeff_index'], 'snpEff_' + univ_options['ref'] + + '_gencode.config']), '-no-intergenic', '-no-downstream', '-no-upstream', # '-canon', '-noStats', - 'hg19_gencode', + univ_options['ref'] + '_gencode', input_files['merged_mutations.vcf']] xmx = snpeff_options['java_Xmx'] if snpeff_options['java_Xmx'] else univ_options['java_Xmx'] with open('/'.join([work_dir, 'mutations.vcf']), 'w') as snpeff_file: diff --git a/src/protect/mutation_calling/radia.py b/src/protect/mutation_calling/radia.py index 239f5740..1456f873 100644 --- a/src/protect/mutation_calling/radia.py +++ b/src/protect/mutation_calling/radia.py @@ -167,7 +167,7 @@ def run_radia_perchrom(job, bams, univ_options, radia_options, chrom): ''.join(['--rnaTumorFasta=', input_files['genome.fa']]), '-f', input_files['genome.fa'], '-o', docker_path(radia_output), - '-i', 'hg19_M_rCRS', + '-i', univ_options['ref'], '-m', input_files['genome.fa'], '-d', 'aarjunrao@soe.ucsc.edu', '-q', 'Illumina', @@ -206,11 +206,20 @@ def run_filter_radia(job, bams, radia_file, univ_options, radia_options, chrom): 'normal.bam.bai': bams['normal_dnai'], 'radia.vcf': radia_file, 'genome.fa.tar.gz': radia_options['genome_fasta'], - 'genome.fa.fai.tar.gz': radia_options['genome_fai']} + 'genome.fa.fai.tar.gz': radia_options['genome_fai'], + 'cosmic_beds': 
radia_options['cosmic_beds'], + 'dbsnp_beds': radia_options['dbsnp_beds'], + 'retrogene_beds': radia_options['retrogene_beds'], + 'pseudogene_beds': radia_options['pseudogene_beds'], + 'gencode_beds': radia_options['gencode_beds'] + } input_files = get_files_from_filestore(job, input_files, work_dir, docker=False) for key in ('genome.fa', 'genome.fa.fai'): input_files[key] = untargz(input_files[key + '.tar.gz'], work_dir) + for key in ('cosmic_beds', 'dbsnp_beds', 'retrogene_beds', 'pseudogene_beds', 'gencode_beds'): + input_files[key] = untargz(input_files[key], work_dir) + input_files = {key: docker_path(path) for key, path in input_files.items()} filterradia_log = ''.join([work_dir, '/radia_filtered_', chrom, '_radia.log']) @@ -219,11 +228,11 @@ def run_filter_radia(job, bams, radia_file, univ_options, radia_options, chrom): input_files['radia.vcf'], '/data', '/home/radia/scripts', - '-d', '/home/radia/data/hg19/snp135', - '-r', '/home/radia/data/hg19/retroGenes/', - '-p', '/home/radia/data/hg19/pseudoGenes/', - '-c', '/home/radia/data/hg19/cosmic/', - '-t', '/home/radia/data/hg19/gaf/2_1', + '-d', input_files['dbsnp_beds'], + '-r', input_files['retrogene_beds'], + '-p', input_files['pseudogene_beds'], + '-c', input_files['cosmic_beds'], + '-t', input_files['gencode_beds'], '--noSnpEff', '--noBlacklist', '--noTargets', diff --git a/src/protect/pipeline/ProTECT.py b/src/protect/pipeline/ProTECT.py index 529aba0d..5a45ee18 100644 --- a/src/protect/pipeline/ProTECT.py +++ b/src/protect/pipeline/ProTECT.py @@ -209,6 +209,12 @@ def _parse_config_file(job, config_file, max_cores=None): _ensure_set_contains(input_config[key], required_keys[key], key) if key == 'Universal_Options': univ_options.update(input_config['Universal_Options']) + if univ_options['reference_build'].lower() in ['hg19', 'grch37']: + univ_options['ref'] = 'hg19' + elif univ_options['reference_build'].lower() in ['hg38', 'grch38']: + univ_options['ref'] = 'hg38' + else: + raise 
ParameterError('reference_build can only be hg19, hg38, GRCh37 or GRCh38') assert univ_options['storage_location'].startswith(('Local', 'local', 'aws')) if univ_options['storage_location'] in ('Local', 'local'): assert os.path.isabs(univ_options['output_folder']), ('Needs to be absolute if ' @@ -479,7 +485,7 @@ def get_all_tool_inputs(job, tools): # If a file is of the type file, vcf, tar or fasta, it needs to be downloaded from # S3 if reqd, then written to job store. if option.split('_')[-1] in ['file', 'vcf', 'index', 'fasta', 'fai', 'idx', 'dict', - 'tbi']: + 'tbi', 'beds']: tools[tool][option] = job.addChildJobFn(get_pipeline_inputs, option, tools[tool][option]).rv() elif option == 'version': diff --git a/src/protect/pipeline/defaults.yaml b/src/protect/pipeline/defaults.yaml index 75b227cf..507d17e5 100644 --- a/src/protect/pipeline/defaults.yaml +++ b/src/protect/pipeline/defaults.yaml @@ -38,7 +38,7 @@ alignment: A: AGATCGGAAGAG version: 1.9.1 star: - version: 2.4.2a + version: 2.5.2b bwa: version: 0.7.9a post: @@ -58,7 +58,7 @@ mutation_calling: muse: version: 1.0rc_submission_b391201 radia: - version: 398366ef07b5911d8082ed61cbf03d487a41f286 + version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03 somaticsniper: version: 1.0.4 samtools: @@ -75,7 +75,7 @@ mutation_annotation: mutation_translation: transgene: - version: 2.1.0 + version: 2.1.1 haplotyping: phlat: diff --git a/src/protect/pipeline/input_parameters.yaml b/src/protect/pipeline/input_parameters.yaml index 2ac71d3e..c3ce4083 100644 --- a/src/protect/pipeline/input_parameters.yaml +++ b/src/protect/pipeline/input_parameters.yaml @@ -38,13 +38,14 @@ patients: # The paths can also be to directories on S3 as tumor_dna_fastq_1: S3://bucket/path/to/1.fastq.gz normal_dna_fastq_1: S3://bucket/path/to/1.fastq.gz - tumor_rna_fastq_1: https://s3-.awsamazon.com/bucket/path/to/1.fastq.gz + tumor_rna_fastq_1: https://S3-.awsamazon.com/bucket/path/to/1.fastq.gz # These are options that are used by most tools 
Universal_Options: dockerhub: aarjunrao java_Xmx: 20G + reference_build: hg19 # Acceptable options are hg19, hg38, GRCh37, GRCh38 sse_key: /path/to/master.key # Path to the AWS master key. Required if using AWS else optional sse_key_is_master: True # True or False. Required if using AWS else optional storage_location: Local # Local or aws: for where the output must go @@ -92,6 +93,11 @@ mutation_calling: muse: # version: 1.0rc_submission_b391201 radia: + cosmic_beds: S3://cgl-protect-data/hg19_references/radia_cosmic.tar.gz + dbsnp_beds: S3://cgl-protect-data/hg19_references/radia_dbsnp.tar.gz + retrogene_beds: S3://cgl-protect-data/hg19_references/radia_retrogenes.tar.gz + pseudogene_beds: S3://cgl-protect-data/hg19_references/radia_pseudogenes.tar.gz + gencode_beds: S3://cgl-protect-data/hg19_references/radia_gencode.tar.gz # version: 398366ef07b5911d8082ed61cbf03d487a41f286 somaticsniper: # version: 1.0.4 @@ -112,7 +118,7 @@ mutation_annotation: mutation_translation: transgene: gencode_peptide_fasta: S3://cgl-protect-data/hg19_references/gencode.v19.pc_translations_NOPARY.fa.tar.gz - # version: 2.1.0 + # version: 2.1.1 haplotyping: phlat: diff --git a/src/protect/pipeline/required_entries.yaml b/src/protect/pipeline/required_entries.yaml index e62b3999..52c4dab7 100644 --- a/src/protect/pipeline/required_entries.yaml +++ b/src/protect/pipeline/required_entries.yaml @@ -33,6 +33,7 @@ patients: Universal_Options: storage_location: output_folder: + reference_build: # These options are for each module. You probably don't need to change any of this! 
alignment: @@ -59,6 +60,13 @@ mutation_calling: dbsnp_tbi: strelka: config_file: + radia: + cosmic_beds: + dbsnp_beds: + retrogene_beds: + pseudogene_beds: + gencode_beds: + mutation_annotation: snpeff: diff --git a/src/protect/test/__init__.py b/src/protect/test/__init__.py index 52384c96..1689749e 100644 --- a/src/protect/test/__init__.py +++ b/src/protect/test/__init__.py @@ -97,7 +97,8 @@ def _getTestUnivOptions(self): 'storage_location': 'Local', 'dockerhub': 'aarjunrao', 'java_Xmx': '20G', - 'max_cores': 2} + 'max_cores': 2, + 'ref': 'hg19'} @classmethod def _projectRootPath(cls): diff --git a/src/protect/test/test_inputs/ci_parameters.yaml b/src/protect/test/test_inputs/ci_parameters.yaml index ffc5437f..0cf47d3f 100644 --- a/src/protect/test/test_inputs/ci_parameters.yaml +++ b/src/protect/test/test_inputs/ci_parameters.yaml @@ -40,7 +40,7 @@ patients: Universal_Options: dockerhub: aarjunrao java_Xmx: 20G - + reference_build: 'hg19' output_folder: /mnt/ephemeral/done storage_location: Local @@ -71,6 +71,13 @@ mutation_calling: dbsnp_tbi : S3://cgl-protect-data/ci_references/dbsnp_coding.vcf.gz.tbi mutect: java_Xmx : 2G + radia: + cosmic_beds: S3://cgl-protect-data/hg19_references/radia_cosmic.tar.gz + dbsnp_beds: S3://cgl-protect-data/hg19_references/radia_dbsnp.tar.gz + retrogene_beds: S3://cgl-protect-data/hg19_references/radia_retrogenes.tar.gz + pseudogene_beds: S3://cgl-protect-data/hg19_references/radia_pseudogenes.tar.gz + gencode_beds: S3://cgl-protect-data/hg19_references/radia_gencode.tar.gz + strelka: config_file: S3://cgl-protect-data/hg19_references/strelka_bwa_WXS_config.ini.tar.gz diff --git a/src/protect/test/unit/test_snpeff.py b/src/protect/test/unit/test_snpeff.py new file mode 100644 index 00000000..79473ac1 --- /dev/null +++ b/src/protect/test/unit/test_snpeff.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python2.7 +# Copyright 2016 Arjun Arkal Rao +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Author : Arjun Arkal Rao +Affiliation : UCSC BME, UCSC Genomics Institute +File : protect/test/test_snpeff.py +""" +from __future__ import print_function + +from protect.common import untargz +from protect.mutation_annotation.snpeff import run_snpeff +from protect.pipeline.ProTECT import _parse_config_file +from protect.test import ProtectTest +from toil.job import Job + +import os +import subprocess + + +class TestSnpeff(ProtectTest): + def setUp(self): + super(TestSnpeff, self).setUp() + test_dir = self._createTempDir() + self.options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) + self.options.logLevel = 'INFO' + self.options.workDir = test_dir + self.options.clean = 'always' + + def test_snpeff(self): + """ + Test the functionality of run_snpeff + """ + univ_options = self._getTestUnivOptions() + univ_options['output_folder'] = '/mnt/ephemeral/done' + config_file = os.path.join(self._projectRootPath(), + 'src/protect/test/test_inputs/ci_parameters.yaml') + test_src_folder = os.path.join(self._projectRootPath(), 'src', 'protect', 'test') + a = Job.wrapJobFn(self._get_test_mutation_vcf) + b = Job.wrapJobFn(self._get_all_tools, config_file).encapsulate() + c = Job.wrapJobFn(self._get_tool, b.rv(), 'snpeff') + d = Job.wrapJobFn(run_snpeff, a.rv(), univ_options, c.rv(), disk='100M', + memory='100M', cores=1).encapsulate() + a.addChild(b) + b.addChild(c) + + a.addChild(d) + c.addChild(d) + Job.Runner.startToil(a, self.options) + + @staticmethod + def _get_all_tools(job, config_file): +
sample_set, univ_options, tool_options = _parse_config_file(job, config_file, + max_cores=None) + return tool_options + + @staticmethod + def _get_tool(job, all_tools, tool): + return all_tools[tool] + + @staticmethod + def _get_test_mutation_vcf(job): + """ + Get the test mutation vcf file and write to jobstore + + :return: FSID for the mutations vcf + """ + base_call = 's3am download s3://cgl-protect-data/unit_results/mutations/merged/' + filename = 'all_merged.vcf' + call = (base_call + ('%s.tar.gz ' % filename)*2).strip().split(' ') + subprocess.check_call(call) + untargz(filename + '.tar.gz', os.getcwd()) + return job.fileStore.writeGlobalFile(filename) + + +# noinspection PyProtectedMember +_get_all_tools = TestSnpeff._get_all_tools +# noinspection PyProtectedMember +_get_tool = TestSnpeff._get_tool +# noinspection PyProtectedMember +_get_test_mutation_vcf = TestSnpeff._get_test_mutation_vcf \ No newline at end of file diff --git a/src/protect/test/unit/test_transgene b/src/protect/test/unit/test_transgene new file mode 100644 index 00000000..44b5acc0 --- /dev/null +++ b/src/protect/test/unit/test_transgene @@ -0,0 +1,113 @@ +#!/usr/bin/env python2.7 +# Copyright 2016 Arjun Arkal Rao +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Author : Arjun Arkal Rao +Affiliation : UCSC BME, UCSC Genomics Institute +File : protect/test/test_transgene.py +""" +from __future__ import print_function + +from protect.common import untargz +from protect.mutation_translation import run_transgene +from protect.pipeline.ProTECT import _parse_config_file +from protect.test import ProtectTest +from toil.job import Job + +import os +import subprocess + + +class TestTransgene(ProtectTest): + def setUp(self): + super(TestTransgene, self).setUp() + test_dir = self._createTempDir() + self.options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) + self.options.logLevel = 'INFO' + self.options.workDir = test_dir + self.options.clean = 'always' + + def test_transgene(self): + """ + Test the functionality of run_transgene + """ + univ_options = self._getTestUnivOptions() + univ_options['output_folder'] = '/mnt/ephemeral/done' + config_file = os.path.join(self._projectRootPath(), + 'src/protect/test/test_inputs/ci_parameters.yaml') + test_src_folder = os.path.join(self._projectRootPath(), 'src', 'protect', 'test') + a = Job.wrapJobFn(self._get_test_snpeff_file) + b = Job.wrapJobFn(self._get_test_rna_bam) + c = Job.wrapJobFn(self._get_all_tools, config_file).encapsulate() + d = Job.wrapJobFn(self._get_tool, c.rv(), 'transgene') + e = Job.wrapJobFn(run_transgene, a.rv(), b.rv(), univ_options, d.rv(), disk='100M', + memory='100M', cores=1).encapsulate() + a.addChild(b) + b.addChild(c) + c.addChild(d) + + a.addChild(e) + b.addChild(e) + d.addChild(e) + Job.Runner.startToil(a, self.options) + + @staticmethod + def _get_all_tools(job, config_file): + sample_set, univ_options, tool_options = _parse_config_file(job, config_file, + max_cores=None) + return tool_options + + @staticmethod + def _get_tool(job, all_tools, tool): + return all_tools[tool] + + @staticmethod + def _get_test_snpeff_file(job): + """ + Get the test snpeffed vcf file and write to jobstore + + :return: FSID for the snpeffed vcf file + """ + 
base_call = 's3am download s3://cgl-protect-data/unit_results/mutations/snpeffed/' + filename = 'mutations.vcf' + call = (base_call + ('%s.tar.gz ' % filename)*2).strip().split(' ') + subprocess.check_call(call) + untargz(filename + '.tar.gz', os.getcwd()) + return job.fileStore.writeGlobalFile(filename) + + @staticmethod + def _get_test_rna_bam(job): + """ + Get the test RNA bam and its index and write them to jobstore + + :return: Dict of FSIDs for the RNA bam and its index + """ + base_call = 's3am download s3://cgl-protect-data/unit_results/alignments/' + rna_files = {} + for filename in ['rna_fix_pg_sorted.bam', 'rna_fix_pg_sorted.bam.bai']: + call = (base_call + ('%s.tar.gz ' % filename) * 2).strip().split(' ') + subprocess.check_call(call) + untargz(filename + '.tar.gz', os.getcwd()) + rna_files[filename] = job.fileStore.writeGlobalFile(filename) + return {'rnaAligned.sortedByCoord.out.bam': rna_files} + +# noinspection PyProtectedMember +_get_all_tools = TestTransgene._get_all_tools +# noinspection PyProtectedMember +_get_tool = TestTransgene._get_tool +# noinspection PyProtectedMember +_get_test_snpeff_file = TestTransgene._get_test_snpeff_file +# noinspection PyProtectedMember +_get_test_rna_bam = TestTransgene._get_test_rna_bam \ No newline at end of file