diff --git a/src/main/java/genepi/nf/test/vcf/PathExtension.java b/src/main/java/genepi/nf/test/vcf/PathExtension.java index d0e6737..ba40d28 100644 --- a/src/main/java/genepi/nf/test/vcf/PathExtension.java +++ b/src/main/java/genepi/nf/test/vcf/PathExtension.java @@ -10,5 +10,10 @@ public static VcfFile getVcf(Path self) throws FileNotFoundException, IOExceptio return VcfFileUtil.load(self); } - + + public static VcfFile getVcf(Path self, String phasingDetectionStrategyName) throws IOException { + return VcfFileUtil.load(self, phasingDetectionStrategyName); + + } + } diff --git a/src/main/java/genepi/nf/test/vcf/VcfFileUtil.java b/src/main/java/genepi/nf/test/vcf/VcfFileUtil.java index 1528bbe..b7fe862 100644 --- a/src/main/java/genepi/nf/test/vcf/VcfFileUtil.java +++ b/src/main/java/genepi/nf/test/vcf/VcfFileUtil.java @@ -4,6 +4,8 @@ import java.nio.file.Path; import java.util.HashSet; import java.util.Set; +import java.util.function.BiFunction; +import java.util.function.Function; import genepi.io.text.LineReader; import htsjdk.variant.vcf.VCFFileReader; @@ -11,7 +13,63 @@ public class VcfFileUtil { + public enum PhasingDetectionStrategy { + DEFAULT( + header -> { + // By default, assume the VCF is phased. + return true; + }, + (tiles, previous) -> { + boolean phased = previous; + // If any variant is unphased ("/" separator), the whole VCF is treated as unphased. + if (phased) { + boolean containsSymbol = tiles[9].contains("/"); + + if (containsSymbol) { + phased = false; + } + } + return phased; + } + ), + ONE_FOR_ALL( + header -> { + // By default, assume the VCF is *not* phased. + return false; + }, + (tiles, previous) -> { + // If any variant is phased ("|" separator), the whole VCF is treated as phased. 
+ var phased = previous; + if (phased) { + return phased; + } + boolean containsSymbol = tiles[9].contains("|"); + return containsSymbol; + } + ), + ; + + public final Function detectFromHeader; + + public final BiFunction detectFromVariation; + + PhasingDetectionStrategy(Function detectFromHeader, BiFunction detectFromVariation) { + this.detectFromHeader = detectFromHeader; + this.detectFromVariation = detectFromVariation; + } + + + } + public static VcfFile load(Path vcfFilename) throws IOException { + return load(vcfFilename, PhasingDetectionStrategy.DEFAULT); + } + + public static VcfFile load(Path vcfFilename, String phasingDetectionStrategyName) throws IOException { + return load(vcfFilename, PhasingDetectionStrategy.valueOf(phasingDetectionStrategyName)); + } + + public static VcfFile load(Path vcfFilename, PhasingDetectionStrategy phasingDetectionStrategy) throws IOException { Set chromosomes = new HashSet(); int snps = 0; @@ -29,7 +87,7 @@ public static VcfFile load(Path vcfFilename) throws IOException { LineReader lineReader = new LineReader(vcfFilename.toString()); - boolean phased = true; + boolean phased = phasingDetectionStrategy.detectFromHeader.apply(header); boolean phasedAutodetect = true; boolean firstLine = true; while (lineReader.next()) { @@ -46,22 +104,15 @@ public static VcfFile load(Path vcfFilename) throws IOException { if (samples > 0) { - if (phased) { - boolean containsSymbol = tiles[9].contains("/"); - - if (containsSymbol) { - phased = false; - } - - } + phased = phasingDetectionStrategy.detectFromVariation.apply(tiles, phased); if (firstLine) { boolean containsSymbol = tiles[9].contains("/") || tiles[9].contains("."); - if (!containsSymbol) { - phasedAutodetect = true; - } else { + if (containsSymbol) { phasedAutodetect = false; + } else { + phasedAutodetect = true; } firstLine = false; diff --git a/tests.sh b/tests.sh index a881142..cb0d08a 100755 --- a/tests.sh +++ b/tests.sh @@ -1,2 +1,2 @@ -#!/bin/bash -nf-test test tests/test_vcf.nf.test 
--plugins target/nft-vcf-1.*.jar \ No newline at end of file +#!/usr/bin/env bash +nf-test test tests/test_vcf.nf.test --plugins target/nft-vcf-1.*.jar diff --git a/tests/data/valid/phased-and-unphased.vcf b/tests/data/valid/phased-and-unphased.vcf new file mode 100644 index 0000000..17d6c06 --- /dev/null +++ b/tests/data/valid/phased-and-unphased.vcf @@ -0,0 +1,247 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##DeepVariant_version=1.5.0 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##bcftools_filterVersion=1.18+htslib-1.18 +##bcftools_filterCommand=filter --output test_S1-filtered.vcf.gz --threads 2 --output-type z -i '((ILEN < 1024 && ILEN > -1024) || TYPE!~"indel") && FILTER="PASS"' test_S1.vcf.gz; Date=Wed Oct 9 08:28:58 2024 +##HiPhase_version="1.1.0-ae33ce1" +##HiPhase_command="hiphase --bam test_S1-aligned.bam --vcf test_S1-filtered.vcf.gz --output-vcf test_S1-phased.vcf.gz --output-bam test_S1-haplotagged.bam --reference GRCh38_GIABv3_no_alt_analysis_set_maskedGRC_decoys_MAP2K3_KMT2C_KCNJ18.fa --threads 2" +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test_S1 +chrX 1189204 . A <*> 0 . END=1189206 GT:GQ:MIN_DP:PL 0/0:12:4:0,12,119 +chrX 1189207 . T G 5.8 PASS . GT:GQ:DP:AD:VAF:PL:PS 0|1:5:4:1,3:0.75:4,0,11:1189207 +chrX 1189208 . A T,<*> 5.4 PASS . GT:GQ:DP:AD:VAF:PL 0/1:5:4:0,3,0:0.75,0:3,0,10,990,990,990 +chrX 1189209 . G <*> 0 . END=1189209 GT:GQ:MIN_DP:PL ./.:0:4:17,0,77 +chrX 1189210 . T <*> 0 . END=1189210 GT:GQ:MIN_DP:PL 0/0:12:4:0,12,119 +chrX 1189211 . 
G <*> 0 . END=1189211 GT:GQ:MIN_DP:PL ./.:0:4:17,0,77 +chrX 1189212 . T <*> 0 . END=1189212 GT:GQ:MIN_DP:PL 0/0:12:4:0,12,119 diff --git a/tests/test_vcf.nf.test b/tests/test_vcf.nf.test index f918210..18dba64 100644 --- a/tests/test_vcf.nf.test +++ b/tests/test_vcf.nf.test @@ -294,5 +294,28 @@ nextflow_process { } } - + + test("Should use a specific phasing strategy") { + + when { + process { + """ + input[0] = file("${projectDir}/tests/data/valid/phased-and-unphased.vcf") + """ + } + } + + then { + def filename = process.out.vcf.get(0) + with(path(filename).getVcf()) { + assert !phased + } + with(path(filename).getVcf("ONE_FOR_ALL")) { + assert phased + } + + } + } + + }