From 5ae429d3a1241c801258a1ed52541976d0a195e9 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Tue, 9 Aug 2022 15:27:51 +0100 Subject: [PATCH] Handle non-ACGT alleles and het genotypes --- requirements.txt | 2 +- tests/adjudicator_test.py | 11 +++++++- tests/data/adjudicator/run.calls.1.vcf | 6 ++-- tests/data/adjudicator/run.expect_final.vcf | 31 +++++++++++++++++++++ tests/data/adjudicator/run.make_files.sh | 7 +++-- 5 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 tests/data/adjudicator/run.expect_final.vcf diff --git a/requirements.txt b/requirements.txt index 8a81dcf..a999d75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -cluster_vcf_records >= 0.13.2 +cluster_vcf_records >= 0.13.3 matplotlib pyfastaq >= 3.14.0 pysam >= 0.12 diff --git a/tests/adjudicator_test.py b/tests/adjudicator_test.py index 77bbfca..3c82719 100644 --- a/tests/adjudicator_test.py +++ b/tests/adjudicator_test.py @@ -9,6 +9,11 @@ data_dir = os.path.join(this_dir, "data", "adjudicator") +def _load_vcf_record_lines(infile): + with open(infile) as f: + return [x.rstrip() for x in f if not x.startswith("#")] + + def test_get_gramtools_kmer_size(): """test _get_gramtools_kmer_size""" build_dir = os.path.join(data_dir, "get_gramtools_kmer_size.build") @@ -21,7 +26,7 @@ def test_get_gramtools_kmer_size(): def test_run_clean_is_false(): """test run when not cleaning up files afterwards""" # We're just testing that it doesn't crash. - # Check the output files exist, but not their contents. + # Check the output files exist, but not their contents (except VCF) # First run using splitting of VCF file. # Then run without splitting. outdir = "tmp.adjudicator.noclean.out" @@ -47,6 +52,10 @@ def test_run_clean_is_false(): assert os.path.exists(adj.log_file) assert os.path.exists(adj.final_vcf) assert os.path.exists(adj.clustered_vcf) + expect_vcf = os.path.join(data_dir, "run.expect_final.vcf") + expect_lines = _load_vcf_record_lines(expect_vcf) + got_lines = _load_vcf_record_lines(adj.final_vcf) + assert got_lines == expect_lines # Clean up and then run without splitting shutil.rmtree(outdir) diff --git a/tests/data/adjudicator/run.calls.1.vcf b/tests/data/adjudicator/run.calls.1.vcf index 3abd437..d72b363 100644 --- a/tests/data/adjudicator/run.calls.1.vcf +++ b/tests/data/adjudicator/run.calls.1.vcf @@ -36,7 +36,7 @@ ref.1 299 . GATA GA 225 . INDEL;IDV=30;IMF=0.967742;DP=31;VDB=0.258769;SGB=-0.69 ref.1 501 . G GAGTC 225 . INDEL;IDV=17;IMF=0.772727;DP=22;VDB=0.516335;SGB=-0.69168;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,9,10;MQ=60 GT:PL 1/1:255,57,0 ref.1 701 . C A 225 . DP=18;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 1/1:255,48,0 ref.1 702 . T G 225 . DP=16;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 1/1:255,48,0 -ref.1 703 . A T 225 . DP=16;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 1/1:255,48,0 -ref.1 704 . A T 225 . DP=16;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 1/1:255,48,0 +ref.1 703 . A T,G 225 . DP=16;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 0/1:255,48,0 +ref.1 704 . A T, 225 . DP=16;VDB=0.0438392;SGB=-0.689466;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,8,8;MQ=60 GT:PL 1/1:255,48,0 ref.1 900 . T G 42 . DP=11;VDB=0.40105;SGB=-0.616816;RPB=0.279932;MQB=0.503877;BQB=1.00775;MQ0F=0.636364;AC=2;AN=2;DP4=4,0,6,0;MQ=12 GT:PL 1/1:85,6,0 -ref.4 61 . C A . . DP=42 GT 1/1 +ref.4 61 . C A,T . . DP=42 GT 1/2 diff --git a/tests/data/adjudicator/run.expect_final.vcf b/tests/data/adjudicator/run.expect_final.vcf new file mode 100644 index 0000000..21508e7 --- /dev/null +++ b/tests/data/adjudicator/run.expect_final.vcf @@ -0,0 +1,31 @@ +##fileformat=VCFv4.2 +##source=minos, version 0.12.3 +##fileDate=2022-08-09 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##minosMeanReadDepth=16.857 +##minosReadDepthVariance=111.143 +##contig= +##contig= +##contig= +##contig= +##FORMAT= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT run.bwa.bam +ref.1 100 . G T . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:6:0,6:1.0:6:0,6:63.3:14.15 +ref.1 299 . GAT G . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:30:0,30:1.0:30:0,30:283.3:88.7 +ref.1 501 . G GAGTC . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:24.2:0,24.2:1.0:26:0,26:246.84:81.3 +ref.1 701 . C A . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:24:0,24:1.0:24:0,24:228.6:77.3 +ref.1 702 . T G . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:25:0,25:1.0:25:0,25:237.72:79.25 +ref.1 703 . A T . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:25:0,25:1.0:25:0,25:237.72:79.25 +ref.1 704 . A T . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 1/1:25:0,25:1.0:25:0,25:237.72:79.25 +ref.1 900 . T G . PASS . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE 0/0:13:13,0:1.0:13:13,0:127.96:42.7 +ref.4 61 . C A,T . . . GT:DP:ALLELE_DP:FRS:COV_TOTAL:COV:GT_CONF:GT_CONF_PERCENTILE ./.:.:0,0,0:.:0:0,0,0:0.0:0.0 diff --git a/tests/data/adjudicator/run.make_files.sh b/tests/data/adjudicator/run.make_files.sh index a7dae26..da17f7d 100755 --- a/tests/data/adjudicator/run.make_files.sh +++ b/tests/data/adjudicator/run.make_files.sh @@ -26,10 +26,13 @@ cat $pre.false_call_vcf.vcf >> $pre.calls.1.vcf awk -F"\t" '$2!=100' $pre.calls.1.vcf | awk -F"\t" 'BEGIN{OFS="\t"} {if($2==299) {$5="GA,CC"}; $1=$1; print $0}' > $pre.calls.2.vcf # add a variant to the reference that has no reads -echo -e "ref.4\t61\t.\tC\tA\t.\t.\tDP=42\tGT\t1/1" >> run.calls.1.vcf +echo -e "ref.4\t61\t.\tC\tA,T\t.\t.\tDP=42\tGT\t1/2" >> run.calls.1.vcf # add negative coord to vcf 1 grep '^#' $pre.calls.1.vcf > $pre.calls.1.vcf.$$ echo -e "ref.1\t-1\t.\tA\tG\t61\t.\tDP=7;VDB=0.40105;SGB=-0.616816;MQ0F=0.428571;AC=2;AN=2;DP4=0,0,6,0;MQ=20\tGT:PL\t1/1:88,18,0" >> $pre.calls.1.vcf.$$ grep -v '^#' $pre.calls.1.vcf >> $pre.calls.1.vcf.$$ -mv $pre.calls.1.vcf.$$ $pre.calls.1.vcf + +# change some calls to het instead of hom and add a non-ACGT allele +awk -F"\t" 'BEGIN{OFS="\t"} $2==703 {$5="T,G"; $10="0/1"substr($10,4)} $2==704 {$5="T,"} 1' $pre.calls.1.vcf.$$ > $pre.calls.1.vcf +rm $pre.calls.1.vcf.$$