Skip to content

Commit

Permalink
add workflows for isnvs_merge_to_vcf (#864)
Browse files Browse the repository at this point in the history
* add workflows for isnvs_merge_to_vcf and isnvs_merge_to_vcf_filtered

* consolidate isnv workflows

* include multiple alignment as part of isnv merge_to_vcf

* isnvs_per_sample: also remove ".mapped" from bam file used to infer sample name

* make emaill address optional for isnvs_vcf WDL (and for snpEff)

* infer snpEff accessions from ref fasta if not provided

* wdl corrections

* WIP

* chain commands with  &&; accession parse correction

* set -ex -o pipefail

* comment change

* update snpEff 4.1l -> 4.3.1t

* add snpEff integration test

* comment out custom tmpdir test fixtures

* try older tmpdir fixtures

* replace tmpdir_function fixture with tempfile.gettempdir()

* revert; incorporate changes from 0191d68 (@notestaff)

* only check final snp_eff tabular output

* disable stdout capture by pytest for snpeff test

* ignore BOM in open_or_gzopen

https://en.wikipedia.org/wiki/Byte_order_mark

* typo correction

* debug

* io open

* kwarg reorder

* debug

* revert utf-8-sig

* remove debug

* correct git merge butchery

* skip test_snpeff for now

errors occur on Travis that so far have not been reproducible locally

* dev notes

* allow passage of stderr handle in run_and_print

allow passage of stderr handle in run_and_print, defaulting to stdout redirection as before if not specified

* stderr=subprocess.STDOUT default in run_and_print

* py27 doesn't have subprocess.DEVNULL (thanks, @notestaff)

* correcting late night mistakes

* assertAlmostEqual for floats in snpEff test

* maintain consistent order of inferred sample name w/r/t isnv files

* WIP

* add --emailAddress syntax to snpEff snakemake rule

* WIP

* WIP

* remove test skipIf
  • Loading branch information
tomkinsc authored Jul 25, 2018
1 parent e2515d3 commit 03b8368
Show file tree
Hide file tree
Showing 25 changed files with 981 additions and 103 deletions.
1 change: 1 addition & 0 deletions DEVELOPMENT_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ When upgrading the GATK to a new version:
- in tools/gatk.py change TOOL_VERSION_TUPLE at the top
- in travis/install-gatk.sh change GATK_VERSION at the top
- in easy-deploy-script/easy-deploy-viral-ngs.sh
- in docker/rundocker.sh

### (Automated) testing
[Travis CI](https://travis-ci.org/broadinstitute/viral-ngs) performs automated unit and integration tests for viral-ngs on each branch and pull request. Unit tests are run on each new branch commit, and longer integration tests are performed on pull requests to help ensure the stability of the `master` branch. Pull requests are gated to ensure merging to `master` is allowed only if all tests pass. The Travis configuration is specified in `.travis.yml`, and relies on files stored within `viral-ngs/travis/`.
Expand Down
2 changes: 1 addition & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,4 +120,4 @@ def pytest_terminal_summary(self, terminalreporter, exitstatus):
widths = [max(map(len, col)) for col in zip(*rows)]
for row in rows:
writer.write(" ".join((val.ljust(width) for val, width in zip(row, widths))))
writer.line()
writer.line()
2 changes: 1 addition & 1 deletion docker/rundocker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# A wrapper script to run viral-ngs docker images
# The following paths have to be modified according to end-user environment
NOVOALIGN_PATH="/opt/novocraft" # Directory where novoalign.lic license file can befound
GATK_PATH="/opt/GenomeAnalysisTK-3.6" # Directory where the correct GATK jar file can be found
GATK_PATH="/opt/GenomeAnalysisTK-3.8" # Directory where the correct GATK jar file can be found
IMAGE_HASH_OR_TAG="local/viral-ngs:1.16.0" # This can be found by running this command 'docker images'
DATA_DIR="$1"; shift
GID=$(id -g $USER)
Expand Down
1 change: 0 additions & 1 deletion illumina.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,6 @@ def get_flowcell(self):
log.warn("The provided flowcell ID is longer than 15 characters. Is that correct?")
return fc

@util.misc.memoize
def _get_rundate_obj(self):
"""
Access the text of the <Date> node in the RunInfo.xml file
Expand Down
2 changes: 1 addition & 1 deletion interhost.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ def parser_snpEff(parser=argparse.ArgumentParser()):
parser.add_argument("inVcf", help="Input VCF file")
parser.add_argument("genomes", nargs='+', help="genome name (snpEff db name, or NCBI accessions)")
parser.add_argument("outVcf", help="Output VCF file")
parser.add_argument("emailAddress",
parser.add_argument("--emailAddress",
help="""Your email address. To access the Genbank CoreNucleotide database,
NCBI requires you to specify your email address with each request.
In case of excessive usage of the E-utilities, NCBI will attempt to contact
Expand Down
37 changes: 27 additions & 10 deletions intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,31 @@ def merge_to_vcf(
guessed_samples = samplenames_from_isnvs + list(samplenames_from_alignments-(refnames|set(samplenames_from_isnvs)))
log.info("guessed sample names %s" % guessed_samples)

samples = samples or guessed_samples
samples = samples if samples is not None and len(samples)>0 else guessed_samples

samp_to_isnv = {}
# if we had to guess sample names, match them up to isnv files
if len(guessed_samples)>0:
matched_samples = []
matched_isnv_files = []
for sample in samples:
sample_found=False
for isnvs_file in isnvs:
for row in util.file.read_tabfile(isnvs_file):
if sample==sampleIDMatch(row[0]):
samp_to_isnv[sample] = isnvs_file
sample_found=True
matched_samples.append(sample)
matched_isnv_files.append(isnvs_file)
break
if sample_found:
break
samples = matched_samples
isnvs = matched_isnv_files
else:
samp_to_isnv = dict(zip(samples, isnvs))

log.info(samp_to_isnv)

# get IDs and sequence lengths for reference sequence
with util.file.open_or_gzopen(refFasta, 'r') as inf:
Expand Down Expand Up @@ -567,22 +591,14 @@ def merge_to_vcf(
# to the assemblies

# if we had to guess samples only check that the number of isnv files == number of alignments
if len(guessed_samples)>0:
if not (number_of_aligned_sequences - 1) == num_isnv_files:
raise LookupError(
"""The number of isnv files provided (%s) and must equal the number of sequences
seen in the alignment (%s) (plus an extra reference record in the alignment).
%s does not have the right number of sequences""" % (num_isnv_files,number_of_aligned_sequences - 1,fileName))
else:
if len(guessed_samples)==0:
if not (number_of_aligned_sequences - 1) == num_isnv_files == len(samples):
raise LookupError(
"""The number of isnv files provided (%s) and must equal the number of sequences
seen in the alignment (%s) (plus an extra reference record in the alignment),
as well as the number of sample names provided (%s)
%s does not have the right number of sequences""" % (num_isnv_files,number_of_aligned_sequences - 1,len(samples),fileName))

samp_to_isnv = dict(zip(samples, isnvs))

# one reference chrom at a time
with open(refFasta, 'r') as inf:
for ref_sequence in Bio.SeqIO.parse(inf, 'fasta'):
Expand Down Expand Up @@ -611,6 +627,7 @@ def merge_to_vcf(
for sampleName in samplesToUse:
if seq.id == sampleName:
samp_to_seqIndex[sampleName] = seq.seq.ungap('-')
break

if not len(samp_to_seqIndex) == len(samplesToUse):
raise LookupError(
Expand Down
18 changes: 18 additions & 0 deletions pipes/WDL/workflows/isnvs_merge_to_vcf.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import "tasks_interhost.wdl" as interhost
import "tasks_intrahost.wdl" as tasks_intrahost

workflow isnvs_merge_to_vcf {
File reference_fasta
Array[File]+ assemblies_fasta # one per genome
call interhost.multi_align_mafft_ref as mafft {
input:
reference_fasta = reference_fasta,
assemblies_fasta = assemblies_fasta
}
call tasks_intrahost.isnvs_vcf {
input:
perSegmentMultiAlignments = mafft.alignments_by_chr,
reference_fasta = reference_fasta
}
}
68 changes: 19 additions & 49 deletions pipes/WDL/workflows/tasks/tasks_intrahost.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ task isnvs_per_sample {
Int? minReadsPerStrand
Int? maxBias

String sample_name = basename(basename(mapped_bam, ".bam"), ".all")
String sample_name = basename(basename(basename(mapped_bam, ".bam"), ".all"), ".mapped")

command {
intrahost.py vphaser_one_sample \
Expand All @@ -29,61 +29,31 @@ task isnvs_per_sample {
}
}


task isnvs_vcf {
Array[File] vphaser2Calls # vphaser output; ex. vphaser2.${sample}.txt.gz
Array[File] perSegmentMultiAlignments # aligned_##.fasta, where ## is segment number
File reference_fasta

Array[String] snpEffRef # list of accessions to build/find snpEff database
Array[String]? snpEffRef # list of accessions to build/find snpEff database
Array[String]? sampleNames # list of sample names
String emailAddress # email address passed to NCBI if we need to download reference sequences
String? emailAddress # email address passed to NCBI if we need to download reference sequences
Boolean naiveFilter=false

command {
set -ex -o pipefail

SAMPLES="${sep=' ' sampleNames}"
if [ -n "$SAMPLES" ]; then SAMPLES="--samples $SAMPLES"; fi

intrahost.py merge_to_vcf \
${reference_fasta} \
isnvs.vcf.gz \
$SAMPLES \
--isnvs ${sep=' ' vphaser2Calls} \
--alignments ${sep=' ' perSegmentMultiAlignments} \
--strip_chr_version \
--parse_accession

interhost.py snpEff \
isnvs.vcf.gz \
${sep=' ' snpEffRef} \
isnvs.annot.vcf.gz \
${emailAddress}

intrahost.py iSNV_table \
isnvs.annot.vcf.gz \
isnvs.annot.txt.gz
}

output {
Array[File] isnvFiles = ["isnvs.vcf.gz", "isnvs.vcf.gz.tbi", "isnvs.annot.vcf.gz", "isnvs.annot.txt.gz", "isnvs.annot.vcf.gz.tbi"]
}
runtime {
memory: "4 GB"
docker: "quay.io/broadinstitute/viral-ngs"
}
}

task isnvs_vcf_filtered {
Array[File] vphaser2Calls # vphaser output; ex. vphaser2.${sample}.txt.gz
Array[File] perSegmentMultiAlignments # aligned_##.fasta, where ## is segment number
File reference_fasta
providedSnpRefAccessions="${sep=' ' snpEffRef}"
if [ -n "$providedSnpRefAccessions" ]; then
snpRefAccessions="$providedSnpRefAccessions";
else
snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('${reference_fasta}', 'fasta'))))")"
fi

Array[String] snpEffRef # list of accessions to build/find snpEff database
Array[String]? sampleNames # list of sample names
String emailAddress # email address passed to NCBI if we need to download reference sequences
Boolean naiveFilter

command {
SAMPLES="${sep=' ' sampleNames}"
if [ -n "$SAMPLES" ]; then SAMPLES="--samples $SAMPLES"; fi
echo "snpRefAccessions: $snpRefAccessions"

intrahost.py merge_to_vcf \
${reference_fasta} \
Expand All @@ -92,18 +62,18 @@ task isnvs_vcf_filtered {
--isnvs ${sep=' ' vphaser2Calls} \
--alignments ${sep=' ' perSegmentMultiAlignments} \
--strip_chr_version \
${'--naive_filter' + naiveFilter} \
${true="--naive_filter" false="" naiveFilter} \
--parse_accession

interhost.py snpEff \
isnvs.vcf.gz \
${sep=' ' snpEffRef} \
$snpRefAccessions \
isnvs.annot.vcf.gz \
${emailAddress}
${'--emailAddress=' + emailAddress}

intrahost.py iSNV_table \
isnvs.annot.vcf.gz \
isnvs.annot.txt.gz \
isnvs.annot.txt.gz
}

output {
Expand Down
4 changes: 2 additions & 2 deletions pipes/rules/intrahost.rules
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ rule isnvs_vcf:
UGER = config.get('UGER_queues', {}).get('short', '-l h_rt=04:00:00'),
logid = "all",
snpEff_ref = " ".join(config["accessions_for_ref_genome_build"]),
email_address = config["email_point_of_contact_for_ncbi"]
email_address = "--emailAddress "+config["email_point_of_contact_for_ncbi"] if config["email_point_of_contact_for_ncbi"] else ""
run:
shell("{config[bin_dir]}/intrahost.py merge_to_vcf {input.ref_genome} {output.raw_vcf}"
+ " --samples " + " ".join(read_samples_file(config["samples_assembly"]))
Expand Down Expand Up @@ -92,7 +92,7 @@ rule isnvs_vcf_filtered:
UGER = config.get('UGER_queues', {}).get('short', '-l h_rt=04:00:00'),
logid = "all",
snpEff_ref = " ".join(config["accessions_for_ref_genome_build"]),
emailAddress = config["email_point_of_contact_for_ncbi"],
email_address = "--emailAddress "+config["email_point_of_contact_for_ncbi"] if config["email_point_of_contact_for_ncbi"] else "",
naiveFilter = "--naive_filter" if config["vcf_merge_naive_filter"] else ""
run:
shell("{config[bin_dir]}/intrahost.py merge_to_vcf {input.ref_genome} {output.raw_vcf}"
Expand Down
10 changes: 5 additions & 5 deletions read_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,11 +754,11 @@ def main_reheader_bam(args):
''' Copy a BAM file (inBam to outBam) while renaming elements of the BAM header.
The mapping file specifies which (key, old value, new value) mappings. For
example:
LB lib1 lib_one
SM sample1 Sample_1
SM sample2 Sample_2
SM sample3 Sample_3
CN broad BI
LB lib1 lib_one
SM sample1 Sample_1
SM sample2 Sample_2
SM sample3 Sample_3
CN broad BI
'''
# read mapping file
mapper = dict((a + ':' + b, a + ':' + c) for a, b, c in util.file.read_tabfile(args.rgMap))
Expand Down
2 changes: 1 addition & 1 deletion requirements-conda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ picard=2.18.9
pigz=2.4
prinseq=0.20.4
samtools=1.7
snpeff=4.1l
snpeff=4.3.1t
spades=3.11.1
tbl2asn=25.6
trimmomatic=0.38
Expand Down
2 changes: 1 addition & 1 deletion test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,4 @@ def inputs(self, *fnames):
def assert_none_executable():
testDir = os.path.dirname(__file__)
assert all(not os.access(os.path.join(testDir, filename), os.X_OK) for filename in os.listdir(testDir)
if filename.endswith('.py'))
if filename.endswith('.py'))
Loading

0 comments on commit 03b8368

Please sign in to comment.