Skip to content

Commit

Permalink
Handle non-ACGT alleles and het genotypes
Browse files Browse the repository at this point in the history
  • Loading branch information
martinghunt committed Aug 9, 2022
1 parent 27c4746 commit 5ae429d
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 7 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cluster_vcf_records >= 0.13.2
cluster_vcf_records >= 0.13.3
matplotlib
pyfastaq >= 3.14.0
pysam >= 0.12
Expand Down
11 changes: 10 additions & 1 deletion tests/adjudicator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
data_dir = os.path.join(this_dir, "data", "adjudicator")


def _load_vcf_record_lines(infile):
with open(infile) as f:
return [x.rstrip() for x in f if not x.startswith("#")]


def test_get_gramtools_kmer_size():
"""test _get_gramtools_kmer_size"""
build_dir = os.path.join(data_dir, "get_gramtools_kmer_size.build")
Expand All @@ -21,7 +26,7 @@ def test_get_gramtools_kmer_size():
def test_run_clean_is_false():
"""test run when not cleaning up files afterwards"""
# We're just testing that it doesn't crash.
# Check the output files exist, but not their contents.
# Check the output files exist, but not their contents (except VCF)
# First run using splitting of VCF file.
# Then run without splitting.
outdir = "tmp.adjudicator.noclean.out"
Expand All @@ -47,6 +52,10 @@ def test_run_clean_is_false():
assert os.path.exists(adj.log_file)
assert os.path.exists(adj.final_vcf)
assert os.path.exists(adj.clustered_vcf)
expect_vcf = os.path.join(data_dir, "run.expect_final.vcf")
expect_lines = _load_vcf_record_lines(expect_vcf)
got_lines = _load_vcf_record_lines(adj.final_vcf)
assert got_lines == expect_lines

# Clean up and then run without splitting
shutil.rmtree(outdir)
Expand Down
6 changes: 3 additions & 3 deletions tests/data/adjudicator/run.calls.1.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ ref.1 299 . GATA GA 225 . INDEL;IDV=30;IMF=0.967742;DP=31;VDB=0.258769;SGB=-0.69
ref.1 501 . G GAGTC 225 . INDEL;IDV=17;IMF=0.772727;DP=22;VDB=0.516335;SGB=-0.69168;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,9,10;MQ=60 GT:PL 1/1:255,57,0
ref.1 701 . C A 225 . DP=18;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 1/1:255,48,0
ref.1 702 . T G 225 . DP=16;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 1/1:255,48,0
ref.1 703 . A T 225 . DP=16;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 1/1:255,48,0
ref.1 704 . A T 225 . DP=16;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 1/1:255,48,0
ref.1 703 . A T,G 225 . DP=16;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 0/1:255,48,0
ref.1 704 . A T,<FOO> 225 . DP=16;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 1/1:255,48,0
ref.1 900 . T G 42 . DP=11;VDB=0.40105;SGB=-0.616816;RPB=0.279932;MQB=0.503877;BQB=1.00775;MQ0F=0.636364;AC=2;AN=2;DP4=4,0,6,0;MQ=12 GT:PL 1/1:85,6,0
ref.4 61 . C A . . DP=42 GT 1/1
ref.4 61 . C A,T . . DP=42 GT 1/2
31 changes: 31 additions & 0 deletions tests/data/adjudicator/run.expect_final.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
##fileformat=VCFv4.2
##source=minos, version 0.12.3
##fileDate=2022-08-09
##FORMAT=<ID=ALLELE_DP,Number=R,Type=Float,Description="Mean read depth of ref and each allele">
##FORMAT=<ID=COV,Number=R,Type=Integer,Description="Number of reads on ref and alt alleles">
##FORMAT=<ID=COV_TOTAL,Number=1,Type=Integer,Description="Total reads mapped at this site, from gramtools">
##FORMAT=<ID=DP,Number=1,Type=Float,Description="Mean read depth of called allele (ie the ALLELE_DP of the called allele)">
##FORMAT=<ID=FRS,Number=1,Type=Float,Description="Fraction of reads that support the genotype call">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GT_CONF,Number=1,Type=Float,Description="Genotype confidence. Difference in log likelihood of most likely and next most likely genotype">
##minosMeanReadDepth=16.857
##minosReadDepthVariance=111.143
##contig=<ID=ref.1,length=1000>
##contig=<ID=ref.2,length=1000>
##contig=<ID=ref.3,length=1000>
##contig=<ID=ref.4,length=180>
##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">
##FILTER=<ID=MIN_FRS,Description="Minimum FRS of 0.9">
##FILTER=<ID=MIN_DP,Description="Minimum DP of 0">
##FILTER=<ID=MIN_GCP,Description="Minimum GT_CONF_PERCENTILE of 5">
##FILTER=<ID=MAX_DP,Description="Maximum DP of 48.484314144580786 (= 3.0 standard deviations from the mean read depth 16.857)">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT run.bwa.bam
ref.1 100 . G T . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:6:0,6:1.0:6:0,6:63.3:14.15
ref.1 299 . GAT G . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:30:0,30:1.0:30:0,30:283.3:88.7
ref.1 501 . G GAGTC . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:24.2:0,24.2:1.0:26:0,26:246.84:81.3
ref.1 701 . C A . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:24:0,24:1.0:24:0,24:228.6:77.3
ref.1 702 . T G . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:25:0,25:1.0:25:0,25:237.72:79.25
ref.1 703 . A T . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:25:0,25:1.0:25:0,25:237.72:79.25
ref.1 704 . A T . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:25:0,25:1.0:25:0,25:237.72:79.25
ref.1 900 . T G . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 0/0:13:13,0:1.0:13:13,0:127.96:42.7
ref.4 61 . C A,T . . . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE ./.:.:0,0,0:.:0:0,0,0:0.0:0.0
7 changes: 5 additions & 2 deletions tests/data/adjudicator/run.make_files.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,13 @@ cat $pre.false_call_vcf.vcf >> $pre.calls.1.vcf
awk -F"\t" '$2!=100' $pre.calls.1.vcf | awk -F"\t" 'BEGIN{OFS="\t"} {if($2==299) {$5="GA,CC"}; $1=$1; print $0}' > $pre.calls.2.vcf

# add a variant to the reference that has no reads
echo -e "ref.4\t61\t.\tC\tA\t.\t.\tDP=42\tGT\t1/1" >> run.calls.1.vcf
echo -e "ref.4\t61\t.\tC\tA,T\t.\t.\tDP=42\tGT\t1/2" >> run.calls.1.vcf

# add negative coord to vcf 1
grep '^#' $pre.calls.1.vcf > $pre.calls.1.vcf.$$
echo -e "ref.1\t-1\t.\tA\tG\t61\t.\tDP=7;VDB=0.40105;SGB=-0.616816;MQ0F=0.428571;AC=2;AN=2;DP4=0,0,6,0;MQ=20\tGT:PL\t1/1:88,18,0" >> $pre.calls.1.vcf.$$
grep -v '^#' $pre.calls.1.vcf >> $pre.calls.1.vcf.$$
mv $pre.calls.1.vcf.$$ $pre.calls.1.vcf

# change some calls to het instead of hom and add a non-ACGT allele
awk -F"\t" 'BEGIN{OFS="\t"} $2==703 {$5="T,G"; $10="0/1"substr($10,4)} $2==704 {$5="T,<FOO>"} 1' $pre.calls.1.vcf.$$ > $pre.calls.1.vcf
rm $pre.calls.1.vcf.$$

0 comments on commit 5ae429d

Please sign in to comment.