Skip to content

Commit

Permalink
Merge pull request #164 from /issues/161-provide-hg38-support
Browse files Browse the repository at this point in the history
Add hg38 support (resolves #161)
  • Loading branch information
arkal authored Mar 2, 2017
2 parents 401c96c + 73350d7 commit 32a4348
Show file tree
Hide file tree
Showing 14 changed files with 278 additions and 26 deletions.
14 changes: 11 additions & 3 deletions MANUAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,8 @@ These describe options that are used universally by most tools/jobs in the workf
java_Xmx: 20G -> The default Java heap space to be provided to tools.
Per-tool heap space can be specified for some tools to
override this value.
reference_build: hg19 -> The reference build used in this run. Can be hg19,
hg38, GRCh37 or GRCh38.
sse_key: /path/to/master.key -> Used to create per-file SSE-C keys for decrypting S3
hosted input files. It is highly recommended that the
files be uploaded to S3 using s3am using the
Expand Down Expand Up @@ -275,8 +277,14 @@ be substituted with S3 links. Descriptions for creating all files can be found i
version: 1.1.7
muse:
version: 1.0rc_submission_b391201
radia:
version: 398366ef07b5911d8082ed61cbf03d487a41f286
radia: -> Radia uses perchrom bed files in
folders as references.
cosmic_beds: /path/to/radia_cosmic.tar.gz
dbsnp_beds: /path/to/radia_dbsnp.tar.gz
retrogene_beds: /path/to/radia_retrogenes.tar.gz
pseudogene_beds: /path/to/radia_pseudogenes.tar.gz
gencode_beds: /path/to/radia_gencode.tar.gz
version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03
somaticsniper:
version: 1.0.4
samtools: -> pileup reads
Expand Down Expand Up @@ -308,7 +316,7 @@ be substituted with S3 links. Descriptions for creating all files can be found i
file must be made to follow the
gencode format for fasta record
names
version: 2.1.0
version: 2.1.1

haplotyping:
phlat:
Expand Down
8 changes: 4 additions & 4 deletions required_docker_tools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# Alignment
cutadapt:1.9.1
bwa:0.7.9a
star:2.4.2a
star:2.5.2b
starlong:2.4.2a

# Alignment post
Expand All @@ -22,8 +22,8 @@

# Mutation Calling
mutect:1.1.7
radia:398366ef07b5911d8082ed61cbf03d487a41f286
filterradia:398366ef07b5911d8082ed61cbf03d487a41f286
radia:bcda721fc1f9c28d8b9224c2f95c440759cd3a03
filterradia:bcda721fc1f9c28d8b9224c2f95c440759cd3a03
muse:1.0rc_submission_b391201
somaticsniper:1.0.4
somaticsniper-addons:1.0.4
Expand All @@ -35,7 +35,7 @@
snpeff:3.6

# Mutation Translation
transgene:2.1.0
transgene:2.1.1

# MHC:peptide binding prediction
mhci:2.13
Expand Down
2 changes: 1 addition & 1 deletion src/protect/alignment/dna.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def run_bwa(job, fastqs, sample_type, univ_options, bwa_options):
parameters = ['mem',
'-t', str(bwa_options['n']),
'-v', '1', # Don't print INFO messages to the stderr
'/'.join([input_files['bwa_index'], 'hg19']),
'/'.join([input_files['bwa_index'], univ_options['ref']]),
input_files['dna_1.fastq' + gz],
input_files['dna_2.fastq' + gz]]
with open(''.join([work_dir, '/', sample_type, '_aligned.sam']), 'w') as samfile:
Expand Down
2 changes: 1 addition & 1 deletion src/protect/expression_profiling/rsem.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def run_rsem(job, rna_bam, univ_options, rsem_options):
'--bam',
input_files['star_transcriptome.bam'],
'--no-bam-output',
'/'.join([input_files['rsem_index'], 'hg19']),
'/'.join([input_files['rsem_index'], univ_options['ref']]),
'rsem']
docker_call(tool='rsem', tool_parameters=parameters, work_dir=work_dir,
dockerhub=univ_options['dockerhub'], tool_version=rsem_options['version'])
Expand Down
5 changes: 3 additions & 2 deletions src/protect/mutation_annotation/snpeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,14 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options):

parameters = ['eff',
'-dataDir', input_files['snpeff_index'],
'-c', '/'.join([input_files['snpeff_index'], 'snpEff_hg19_gencode.config']),
'-c', '/'.join([input_files['snpeff_index'], 'snpEff_' + univ_options['ref'] +
'_gencode.config']),
'-no-intergenic',
'-no-downstream',
'-no-upstream',
# '-canon',
'-noStats',
'hg19_gencode',
univ_options['ref'] + '_gencode',
input_files['merged_mutations.vcf']]
xmx = snpeff_options['java_Xmx'] if snpeff_options['java_Xmx'] else univ_options['java_Xmx']
with open('/'.join([work_dir, 'mutations.vcf']), 'w') as snpeff_file:
Expand Down
23 changes: 16 additions & 7 deletions src/protect/mutation_calling/radia.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def run_radia_perchrom(job, bams, univ_options, radia_options, chrom):
''.join(['--rnaTumorFasta=', input_files['genome.fa']]),
'-f', input_files['genome.fa'],
'-o', docker_path(radia_output),
'-i', 'hg19_M_rCRS',
'-i', univ_options['ref'],
'-m', input_files['genome.fa'],
'-d', '[email protected]',
'-q', 'Illumina',
Expand Down Expand Up @@ -206,11 +206,20 @@ def run_filter_radia(job, bams, radia_file, univ_options, radia_options, chrom):
'normal.bam.bai': bams['normal_dnai'],
'radia.vcf': radia_file,
'genome.fa.tar.gz': radia_options['genome_fasta'],
'genome.fa.fai.tar.gz': radia_options['genome_fai']}
'genome.fa.fai.tar.gz': radia_options['genome_fai'],
'cosmic_beds': radia_options['cosmic_beds'],
'dbsnp_beds': radia_options['dbsnp_beds'],
'retrogene_beds': radia_options['retrogene_beds'],
'pseudogene_beds': radia_options['pseudogene_beds'],
'gencode_beds': radia_options['gencode_beds']
}
input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)

for key in ('genome.fa', 'genome.fa.fai'):
input_files[key] = untargz(input_files[key + '.tar.gz'], work_dir)
for key in ('cosmic_beds', 'dbsnp_beds', 'retrogene_beds', 'pseudogene_beds', 'gencode_beds'):
input_files[key] = untargz(input_files[key], work_dir)

input_files = {key: docker_path(path) for key, path in input_files.items()}

filterradia_log = ''.join([work_dir, '/radia_filtered_', chrom, '_radia.log'])
Expand All @@ -219,11 +228,11 @@ def run_filter_radia(job, bams, radia_file, univ_options, radia_options, chrom):
input_files['radia.vcf'],
'/data',
'/home/radia/scripts',
'-d', '/home/radia/data/hg19/snp135',
'-r', '/home/radia/data/hg19/retroGenes/',
'-p', '/home/radia/data/hg19/pseudoGenes/',
'-c', '/home/radia/data/hg19/cosmic/',
'-t', '/home/radia/data/hg19/gaf/2_1',
'-d', input_files['dbsnp_beds'],
'-r', input_files['retrogene_beds'],
'-p', input_files['pseudogene_beds'],
'-c', input_files['cosmic_beds'],
'-t', input_files['gencode_beds'],
'--noSnpEff',
'--noBlacklist',
'--noTargets',
Expand Down
8 changes: 7 additions & 1 deletion src/protect/pipeline/ProTECT.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,12 @@ def _parse_config_file(job, config_file, max_cores=None):
_ensure_set_contains(input_config[key], required_keys[key], key)
if key == 'Universal_Options':
univ_options.update(input_config['Universal_Options'])
if univ_options['reference_build'].lower() in ['hg19', 'grch37']:
univ_options['ref'] = 'hg19'
elif univ_options['reference_build'].lower() in ['hg38', 'grch38']:
univ_options['ref'] = 'hg38'
else:
raise ParameterError('reference_build can only be hg19, hg38, GRCh37 or GRCh38')
assert univ_options['storage_location'].startswith(('Local', 'local', 'aws'))
if univ_options['storage_location'] in ('Local', 'local'):
assert os.path.isabs(univ_options['output_folder']), ('Needs to be absolute if '
Expand Down Expand Up @@ -479,7 +485,7 @@ def get_all_tool_inputs(job, tools):
# If a file is of the type file, vcf, tar or fasta, it needs to be downloaded from
# S3 if reqd, then written to job store.
if option.split('_')[-1] in ['file', 'vcf', 'index', 'fasta', 'fai', 'idx', 'dict',
'tbi']:
'tbi', 'beds']:
tools[tool][option] = job.addChildJobFn(get_pipeline_inputs, option,
tools[tool][option]).rv()
elif option == 'version':
Expand Down
6 changes: 3 additions & 3 deletions src/protect/pipeline/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ alignment:
A: AGATCGGAAGAG
version: 1.9.1
star:
version: 2.4.2a
version: 2.5.2b
bwa:
version: 0.7.9a
post:
Expand All @@ -58,7 +58,7 @@ mutation_calling:
muse:
version: 1.0rc_submission_b391201
radia:
version: 398366ef07b5911d8082ed61cbf03d487a41f286
version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03
somaticsniper:
version: 1.0.4
samtools:
Expand All @@ -75,7 +75,7 @@ mutation_annotation:

mutation_translation:
transgene:
version: 2.1.0
version: 2.1.1

haplotyping:
phlat:
Expand Down
10 changes: 8 additions & 2 deletions src/protect/pipeline/input_parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,14 @@ patients:
# The paths can also be to directories on S3 as
tumor_dna_fastq_1: S3://bucket/path/to/<tumor_dna_prefix>1.fastq.gz
normal_dna_fastq_1: S3://bucket/path/to/<tumor_dna_prefix>1.fastq.gz
tumor_rna_fastq_1: https://s3-<region>.awsamazon.com/bucket/path/to/<tumor_dna_prefix>1.fastq.gz
tumor_rna_fastq_1: https://S3-<region>.amazonaws.com/bucket/path/to/<tumor_dna_prefix>1.fastq.gz


# These are options that are used by most tools
Universal_Options:
dockerhub: aarjunrao
java_Xmx: 20G
reference_build: hg19 # Acceptable options are hg19, hg38, GRCh37, GRCh38
sse_key: /path/to/master.key # Path to the AWS master key. Required if using AWS else optional
sse_key_is_master: True # True or False. Required if using AWS else optional
storage_location: Local # Local or aws:<bucket_name> for where the output must go
Expand Down Expand Up @@ -92,6 +93,11 @@ mutation_calling:
muse:
# version: 1.0rc_submission_b391201
radia:
cosmic_beds: S3://cgl-protect-data/hg19_references/radia_cosmic.tar.gz
dbsnp_beds: S3://cgl-protect-data/hg19_references/radia_dbsnp.tar.gz
retrogene_beds: S3://cgl-protect-data/hg19_references/radia_retrogenes.tar.gz
pseudogene_beds: S3://cgl-protect-data/hg19_references/radia_pseudogenes.tar.gz
gencode_beds: S3://cgl-protect-data/hg19_references/radia_gencode.tar.gz
# version: 398366ef07b5911d8082ed61cbf03d487a41f286
somaticsniper:
# version: 1.0.4
Expand All @@ -112,7 +118,7 @@ mutation_annotation:
mutation_translation:
transgene:
gencode_peptide_fasta: S3://cgl-protect-data/hg19_references/gencode.v19.pc_translations_NOPARY.fa.tar.gz
# version: 2.1.0
# version: 2.1.1

haplotyping:
phlat:
Expand Down
8 changes: 8 additions & 0 deletions src/protect/pipeline/required_entries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ patients:
Universal_Options:
storage_location:
output_folder:
reference_build:

# These options are for each module. You probably don't need to change any of this!
alignment:
Expand All @@ -59,6 +60,13 @@ mutation_calling:
dbsnp_tbi:
strelka:
config_file:
radia:
cosmic_beds:
dbsnp_beds:
retrogene_beds:
pseudogene_beds:
gencode_beds:


mutation_annotation:
snpeff:
Expand Down
3 changes: 2 additions & 1 deletion src/protect/test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ def _getTestUnivOptions(self):
'storage_location': 'Local',
'dockerhub': 'aarjunrao',
'java_Xmx': '20G',
'max_cores': 2}
'max_cores': 2,
'ref': 'hg19'}

@classmethod
def _projectRootPath(cls):
Expand Down
9 changes: 8 additions & 1 deletion src/protect/test/test_inputs/ci_parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ patients:
Universal_Options:
dockerhub: aarjunrao
java_Xmx: 20G

reference_build: 'hg19'
output_folder: /mnt/ephemeral/done
storage_location: Local

Expand Down Expand Up @@ -71,6 +71,13 @@ mutation_calling:
dbsnp_tbi : S3://cgl-protect-data/ci_references/dbsnp_coding.vcf.gz.tbi
mutect:
java_Xmx : 2G
radia:
cosmic_beds: S3://cgl-protect-data/hg19_references/radia_cosmic.tar.gz
dbsnp_beds: S3://cgl-protect-data/hg19_references/radia_dbsnp.tar.gz
retrogene_beds: S3://cgl-protect-data/hg19_references/radia_retrogenes.tar.gz
pseudogene_beds: S3://cgl-protect-data/hg19_references/radia_pseudogenes.tar.gz
gencode_beds: S3://cgl-protect-data/hg19_references/radia_gencode.tar.gz

strelka:
config_file: S3://cgl-protect-data/hg19_references/strelka_bwa_WXS_config.ini.tar.gz

Expand Down
93 changes: 93 additions & 0 deletions src/protect/test/unit/test_snpeff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/usr/bin/env python2.7
# Copyright 2016 Arjun Arkal Rao
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Author : Arjun Arkal Rao
Affiliation : UCSC BME, UCSC Genomics Institute
File : protect/test/unit/test_snpeff.py
"""
from __future__ import print_function

from protect.common import untargz
from protect.mutation_annotation.snpeff import run_snpeff
from protect.pipeline.ProTECT import _parse_config_file
from protect.test import ProtectTest
from toil.job import Job

import os
import subprocess


class TestSnpeff(ProtectTest):
    """
    Run run_snpeff inside a small Toil workflow using the CI parameter file and a test vcf.
    """
    def setUp(self):
        """
        Build the Toil runner options used by the test: a job store at the test job store path,
        a freshly created temp dir as the work dir, INFO logging and unconditional cleanup.
        """
        super(TestSnpeff, self).setUp()
        test_dir = self._createTempDir()
        self.options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        self.options.logLevel = 'INFO'
        self.options.workDir = test_dir
        self.options.clean = 'always'

    def test_snpeff(self):
        """
        Test the functionality of run_snpeff
        """
        univ_options = self._getTestUnivOptions()
        univ_options['output_folder'] = '/mnt/ephemeral/done'
        config_file = os.path.join(self._projectRootPath(),
                                   'src/protect/test/test_inputs/ci_parameters.yaml')
        # Fetch the input vcf, parse the config into tool options, then pick the snpeff options.
        vcf_job = Job.wrapJobFn(self._get_test_mutation_vcf)
        all_tools_job = Job.wrapJobFn(self._get_all_tools, config_file).encapsulate()
        snpeff_tool_job = Job.wrapJobFn(self._get_tool, all_tools_job.rv(), 'snpeff')
        snpeff_job = Job.wrapJobFn(run_snpeff, vcf_job.rv(), univ_options,
                                   snpeff_tool_job.rv(), disk='100M', memory='100M',
                                   cores=1).encapsulate()
        vcf_job.addChild(all_tools_job)
        all_tools_job.addChild(snpeff_tool_job)

        # run_snpeff consumes the return values of both the vcf job and the tool-selection job,
        # so it is registered as a child of each of them.
        vcf_job.addChild(snpeff_job)
        snpeff_tool_job.addChild(snpeff_job)
        Job.Runner.startToil(vcf_job, self.options)

    @staticmethod
    def _get_all_tools(job, config_file):
        """
        Parse the config file and keep only the tool options.

        :param job: The job object passed in by Toil
        :param config_file: Path to the ProTECT config file to parse
        :return: The third item returned by _parse_config_file (the tool options)
        """
        _, _, tool_options = _parse_config_file(job, config_file, max_cores=None)
        return tool_options

    @staticmethod
    def _get_tool(job, all_tools, tool):
        """
        Pick the entry for one tool out of the parsed tool options.

        :param job: The job object passed in by Toil (unused)
        :param all_tools: Mapping of tool name to its options
        :param tool: Name of the tool to select
        :return: all_tools[tool]
        """
        return all_tools[tool]

    @staticmethod
    def _get_test_mutation_vcf(job):
        """
        Get the test mutation vcf file and write to jobstore

        :param job: The job object passed in by Toil
        :return: FSID for the mutations vcf
        """
        filename = 'all_merged.vcf'
        archive = filename + '.tar.gz'
        # s3am download <source url> <destination>; the archive lands in the current directory.
        call = ['s3am', 'download',
                's3://cgl-protect-data/unit_results/mutations/merged/' + archive,
                archive]
        subprocess.check_call(call)
        untargz(archive, os.getcwd())
        return job.fileStore.writeGlobalFile(filename)


# Expose the static job functions of TestSnpeff under the same names at module level.
# NOTE(review): presumably needed so Toil can resolve the wrapped job functions by name in this
# module when a worker loads the job -- confirm against the Toil version in use.
# noinspection PyProtectedMember
_get_all_tools = TestSnpeff._get_all_tools
# noinspection PyProtectedMember
_get_tool = TestSnpeff._get_tool
# noinspection PyProtectedMember
_get_test_mutation_vcf = TestSnpeff._get_test_mutation_vcf
Loading

0 comments on commit 32a4348

Please sign in to comment.