Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Phase detection strategies #12

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/main/java/genepi/nf/test/vcf/PathExtension.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,10 @@ public static VcfFile getVcf(Path self) throws FileNotFoundException, IOExceptio
return VcfFileUtil.load(self);

}


/**
 * Loads the VCF file at this path with an explicitly named phasing
 * detection strategy.
 *
 * <p>This is a thin wrapper that forwards both arguments unchanged to
 * {@code VcfFileUtil.load(Path, String)}.
 *
 * @param self the path of the VCF file to load
 * @param phasingDetectionStrategyName the name of the phasing detection
 *        strategy, passed through as-is
 * @return the loaded VCF file as returned by {@code VcfFileUtil.load}
 * @throws IOException if {@code VcfFileUtil.load} reports an I/O failure
 */
public static VcfFile getVcf(Path self, String phasingDetectionStrategyName) throws IOException {
    final VcfFile loadedVcf = VcfFileUtil.load(self, phasingDetectionStrategyName);
    return loadedVcf;
}

}
75 changes: 63 additions & 12 deletions src/main/java/genepi/nf/test/vcf/VcfFileUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,72 @@
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Set;
import java.util.function.BiFunction;
import java.util.function.Function;

import genepi.io.text.LineReader;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;

public class VcfFileUtil {

/**
 * Strategies for deciding whether a VCF file as a whole is reported as phased.
 *
 * <p>Each strategy consists of two callbacks:
 * <ul>
 *   <li>{@link #detectFromHeader} yields the initial "phased" value from the
 *       VCF header, before any variant line has been inspected;</li>
 *   <li>{@link #detectFromVariation} is applied to every variant line and
 *       folds the previous "phased" value and the current line into the new
 *       value.</li>
 * </ul>
 *
 * <p>Both strategies look only at {@code tiles[9]}; the caller must therefore
 * pass an array with at least ten elements. NOTE(review): this is presumably
 * the first sample column of a tab-split VCF record - confirm against the
 * caller that produces {@code tiles}.
 */
public enum PhasingDetectionStrategy {

    /**
     * Starts out as "phased" and switches to "not phased" as soon as one
     * variant line carries the unphased genotype separator {@code "/"}.
     */
    DEFAULT(
        header -> {
            // By default, assume the VCF is phased.
            return true;
        },
        (tiles, previous) -> {
            boolean phased = previous;
            // If any variant is not phased, the whole VCF is reported as not phased.
            if (phased) {
                boolean containsUnphasedSeparator = tiles[9].contains("/");

                if (containsUnphasedSeparator) {
                    phased = false;
                }
            }
            return phased;
        }
    ),

    /**
     * Starts out as "not phased" and switches to "phased" as soon as one
     * variant line carries the phased genotype separator {@code "|"}.
     */
    ONE_FOR_ALL(
        header -> {
            // By default, assume the VCF is *not* phased.
            return false;
        },
        (tiles, previous) -> {
            // If any variant is phased, the whole VCF is reported as phased.
            boolean phased = previous;
            if (phased) {
                // Already decided by an earlier line; nothing can revert it.
                return true;
            }
            // "|" marks a phased genotype ("/" would mark an unphased one).
            return tiles[9].contains("|");
        }
    ),
    ;

    /** Computes the initial "phased" value from the VCF header. */
    public final Function<VCFHeader, Boolean> detectFromHeader;

    /**
     * Computes the updated "phased" value from the columns of one variant
     * line and the value computed so far.
     */
    public final BiFunction<String[], Boolean, Boolean> detectFromVariation;

    /**
     * @param detectFromHeader callback producing the initial "phased" value
     * @param detectFromVariation callback folding one variant line into the
     *        running "phased" value
     */
    PhasingDetectionStrategy(Function<VCFHeader, Boolean> detectFromHeader, BiFunction<String[], Boolean, Boolean> detectFromVariation) {
        this.detectFromHeader = detectFromHeader;
        this.detectFromVariation = detectFromVariation;
    }

}

/**
 * Loads a VCF file using the {@code DEFAULT} phasing detection strategy.
 *
 * @param vcfFilename the path of the VCF file to load
 * @return the result of {@code load(Path, PhasingDetectionStrategy)} for
 *         the {@code DEFAULT} strategy
 * @throws IOException if the delegated load reports an I/O failure
 */
public static VcfFile load(Path vcfFilename) throws IOException {
    final PhasingDetectionStrategy defaultStrategy = PhasingDetectionStrategy.DEFAULT;
    return load(vcfFilename, defaultStrategy);
}

/**
 * Loads a VCF file using the phasing detection strategy with the given name.
 *
 * <p>The name is resolved with {@code PhasingDetectionStrategy.valueOf}, so
 * it must match an enum constant exactly (case-sensitive).
 *
 * @param vcfFilename the path of the VCF file to load
 * @param phasingDetectionStrategyName the exact name of a
 *        {@code PhasingDetectionStrategy} constant
 * @return the result of {@code load(Path, PhasingDetectionStrategy)} for
 *         the resolved strategy
 * @throws IOException if the delegated load reports an I/O failure
 * @throws IllegalArgumentException if no strategy constant has the given name
 * @throws NullPointerException if the name is {@code null}
 */
public static VcfFile load(Path vcfFilename, String phasingDetectionStrategyName) throws IOException {
    final PhasingDetectionStrategy resolvedStrategy =
            PhasingDetectionStrategy.valueOf(phasingDetectionStrategyName);
    return load(vcfFilename, resolvedStrategy);
}

public static VcfFile load(Path vcfFilename, PhasingDetectionStrategy phasingDetectionStrategy) throws IOException {

Set<String> chromosomes = new HashSet<String>();
int snps = 0;
Expand All @@ -29,7 +87,7 @@ public static VcfFile load(Path vcfFilename) throws IOException {

LineReader lineReader = new LineReader(vcfFilename.toString());

boolean phased = true;
boolean phased = phasingDetectionStrategy.detectFromHeader.apply(header);
boolean phasedAutodetect = true;
boolean firstLine = true;
while (lineReader.next()) {
Expand All @@ -46,22 +104,15 @@ public static VcfFile load(Path vcfFilename) throws IOException {

if (samples > 0) {

if (phased) {
boolean containsSymbol = tiles[9].contains("/");

if (containsSymbol) {
phased = false;
}

}
phased = phasingDetectionStrategy.detectFromVariation.apply(tiles, phased);

if (firstLine) {
boolean containsSymbol = tiles[9].contains("/") || tiles[9].contains(".");

if (!containsSymbol) {
phasedAutodetect = true;
} else {
if (containsSymbol) {
phasedAutodetect = false;
} else {
phasedAutodetect = true;
}
firstLine = false;

Expand Down
4 changes: 2 additions & 2 deletions tests.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
#!/bin/bash
nf-test test tests/test_vcf.nf.test --plugins target/nft-vcf-1.*.jar
#!/usr/bin/env bash
nf-test test tests/test_vcf.nf.test --plugins target/nft-vcf-1.*.jar
247 changes: 247 additions & 0 deletions tests/data/valid/phased-and-unphased.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FILTER=<ID=RefCall,Description="Genotyping model thinks this site is reference.">
##FILTER=<ID=LowQual,Description="Confidence in this variant being real is below calling threshold.">
##FILTER=<ID=NoCall,Description="Site has depth=0 resulting in no call.">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position (for use with symbolic alleles)">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block.">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=VAF,Number=A,Type=Float,Description="Variant allele fractions.">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods rounded to the closest integer">
##FORMAT=<ID=MED_DP,Number=1,Type=Integer,Description="Median DP observed within the GVCF block rounded to the nearest integer.">
##DeepVariant_version=1.5.0
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
##contig=<ID=chrY,length=57227415>
##contig=<ID=chrM,length=16569>
##contig=<ID=chr1_KI270706v1_random,length=175055>
##contig=<ID=chr1_KI270707v1_random,length=32032>
##contig=<ID=chr1_KI270708v1_random,length=127682>
##contig=<ID=chr1_KI270709v1_random,length=66860>
##contig=<ID=chr1_KI270710v1_random,length=40176>
##contig=<ID=chr1_KI270711v1_random,length=42210>
##contig=<ID=chr1_KI270712v1_random,length=176043>
##contig=<ID=chr1_KI270713v1_random,length=40745>
##contig=<ID=chr1_KI270714v1_random,length=41717>
##contig=<ID=chr2_KI270715v1_random,length=161471>
##contig=<ID=chr2_KI270716v1_random,length=153799>
##contig=<ID=chr3_GL000221v1_random,length=155397>
##contig=<ID=chr4_GL000008v2_random,length=209709>
##contig=<ID=chr5_GL000208v1_random,length=92689>
##contig=<ID=chr9_KI270717v1_random,length=40062>
##contig=<ID=chr9_KI270718v1_random,length=38054>
##contig=<ID=chr9_KI270719v1_random,length=176845>
##contig=<ID=chr9_KI270720v1_random,length=39050>
##contig=<ID=chr11_KI270721v1_random,length=100316>
##contig=<ID=chr14_GL000009v2_random,length=201709>
##contig=<ID=chr14_GL000225v1_random,length=211173>
##contig=<ID=chr14_KI270722v1_random,length=194050>
##contig=<ID=chr14_GL000194v1_random,length=191469>
##contig=<ID=chr14_KI270723v1_random,length=38115>
##contig=<ID=chr14_KI270724v1_random,length=39555>
##contig=<ID=chr14_KI270725v1_random,length=172810>
##contig=<ID=chr14_KI270726v1_random,length=43739>
##contig=<ID=chr15_KI270727v1_random,length=448248>
##contig=<ID=chr16_KI270728v1_random,length=1872759>
##contig=<ID=chr17_GL000205v2_random,length=185591>
##contig=<ID=chr17_KI270729v1_random,length=280839>
##contig=<ID=chr17_KI270730v1_random,length=112551>
##contig=<ID=chr22_KI270731v1_random,length=150754>
##contig=<ID=chr22_KI270732v1_random,length=41543>
##contig=<ID=chr22_KI270733v1_random,length=179772>
##contig=<ID=chr22_KI270734v1_random,length=165050>
##contig=<ID=chr22_KI270735v1_random,length=42811>
##contig=<ID=chr22_KI270736v1_random,length=181920>
##contig=<ID=chr22_KI270737v1_random,length=103838>
##contig=<ID=chr22_KI270738v1_random,length=99375>
##contig=<ID=chr22_KI270739v1_random,length=73985>
##contig=<ID=chrY_KI270740v1_random,length=37240>
##contig=<ID=chrUn_KI270302v1,length=2274>
##contig=<ID=chrUn_KI270304v1,length=2165>
##contig=<ID=chrUn_KI270303v1,length=1942>
##contig=<ID=chrUn_KI270305v1,length=1472>
##contig=<ID=chrUn_KI270322v1,length=21476>
##contig=<ID=chrUn_KI270320v1,length=4416>
##contig=<ID=chrUn_KI270310v1,length=1201>
##contig=<ID=chrUn_KI270316v1,length=1444>
##contig=<ID=chrUn_KI270315v1,length=2276>
##contig=<ID=chrUn_KI270312v1,length=998>
##contig=<ID=chrUn_KI270311v1,length=12399>
##contig=<ID=chrUn_KI270317v1,length=37690>
##contig=<ID=chrUn_KI270412v1,length=1179>
##contig=<ID=chrUn_KI270411v1,length=2646>
##contig=<ID=chrUn_KI270414v1,length=2489>
##contig=<ID=chrUn_KI270419v1,length=1029>
##contig=<ID=chrUn_KI270418v1,length=2145>
##contig=<ID=chrUn_KI270420v1,length=2321>
##contig=<ID=chrUn_KI270424v1,length=2140>
##contig=<ID=chrUn_KI270417v1,length=2043>
##contig=<ID=chrUn_KI270422v1,length=1445>
##contig=<ID=chrUn_KI270423v1,length=981>
##contig=<ID=chrUn_KI270425v1,length=1884>
##contig=<ID=chrUn_KI270429v1,length=1361>
##contig=<ID=chrUn_KI270442v1,length=392061>
##contig=<ID=chrUn_KI270466v1,length=1233>
##contig=<ID=chrUn_KI270465v1,length=1774>
##contig=<ID=chrUn_KI270467v1,length=3920>
##contig=<ID=chrUn_KI270435v1,length=92983>
##contig=<ID=chrUn_KI270438v1,length=112505>
##contig=<ID=chrUn_KI270468v1,length=4055>
##contig=<ID=chrUn_KI270510v1,length=2415>
##contig=<ID=chrUn_KI270509v1,length=2318>
##contig=<ID=chrUn_KI270518v1,length=2186>
##contig=<ID=chrUn_KI270508v1,length=1951>
##contig=<ID=chrUn_KI270516v1,length=1300>
##contig=<ID=chrUn_KI270512v1,length=22689>
##contig=<ID=chrUn_KI270519v1,length=138126>
##contig=<ID=chrUn_KI270522v1,length=5674>
##contig=<ID=chrUn_KI270511v1,length=8127>
##contig=<ID=chrUn_KI270515v1,length=6361>
##contig=<ID=chrUn_KI270507v1,length=5353>
##contig=<ID=chrUn_KI270517v1,length=3253>
##contig=<ID=chrUn_KI270529v1,length=1899>
##contig=<ID=chrUn_KI270528v1,length=2983>
##contig=<ID=chrUn_KI270530v1,length=2168>
##contig=<ID=chrUn_KI270539v1,length=993>
##contig=<ID=chrUn_KI270538v1,length=91309>
##contig=<ID=chrUn_KI270544v1,length=1202>
##contig=<ID=chrUn_KI270548v1,length=1599>
##contig=<ID=chrUn_KI270583v1,length=1400>
##contig=<ID=chrUn_KI270587v1,length=2969>
##contig=<ID=chrUn_KI270580v1,length=1553>
##contig=<ID=chrUn_KI270581v1,length=7046>
##contig=<ID=chrUn_KI270579v1,length=31033>
##contig=<ID=chrUn_KI270589v1,length=44474>
##contig=<ID=chrUn_KI270590v1,length=4685>
##contig=<ID=chrUn_KI270584v1,length=4513>
##contig=<ID=chrUn_KI270582v1,length=6504>
##contig=<ID=chrUn_KI270588v1,length=6158>
##contig=<ID=chrUn_KI270593v1,length=3041>
##contig=<ID=chrUn_KI270591v1,length=5796>
##contig=<ID=chrUn_KI270330v1,length=1652>
##contig=<ID=chrUn_KI270329v1,length=1040>
##contig=<ID=chrUn_KI270334v1,length=1368>
##contig=<ID=chrUn_KI270333v1,length=2699>
##contig=<ID=chrUn_KI270335v1,length=1048>
##contig=<ID=chrUn_KI270338v1,length=1428>
##contig=<ID=chrUn_KI270340v1,length=1428>
##contig=<ID=chrUn_KI270336v1,length=1026>
##contig=<ID=chrUn_KI270337v1,length=1121>
##contig=<ID=chrUn_KI270363v1,length=1803>
##contig=<ID=chrUn_KI270364v1,length=2855>
##contig=<ID=chrUn_KI270362v1,length=3530>
##contig=<ID=chrUn_KI270366v1,length=8320>
##contig=<ID=chrUn_KI270378v1,length=1048>
##contig=<ID=chrUn_KI270379v1,length=1045>
##contig=<ID=chrUn_KI270389v1,length=1298>
##contig=<ID=chrUn_KI270390v1,length=2387>
##contig=<ID=chrUn_KI270387v1,length=1537>
##contig=<ID=chrUn_KI270395v1,length=1143>
##contig=<ID=chrUn_KI270396v1,length=1880>
##contig=<ID=chrUn_KI270388v1,length=1216>
##contig=<ID=chrUn_KI270394v1,length=970>
##contig=<ID=chrUn_KI270386v1,length=1788>
##contig=<ID=chrUn_KI270391v1,length=1484>
##contig=<ID=chrUn_KI270383v1,length=1750>
##contig=<ID=chrUn_KI270393v1,length=1308>
##contig=<ID=chrUn_KI270384v1,length=1658>
##contig=<ID=chrUn_KI270392v1,length=971>
##contig=<ID=chrUn_KI270381v1,length=1930>
##contig=<ID=chrUn_KI270385v1,length=990>
##contig=<ID=chrUn_KI270382v1,length=4215>
##contig=<ID=chrUn_KI270376v1,length=1136>
##contig=<ID=chrUn_KI270374v1,length=2656>
##contig=<ID=chrUn_KI270372v1,length=1650>
##contig=<ID=chrUn_KI270373v1,length=1451>
##contig=<ID=chrUn_KI270375v1,length=2378>
##contig=<ID=chrUn_KI270371v1,length=2805>
##contig=<ID=chrUn_KI270448v1,length=7992>
##contig=<ID=chrUn_KI270521v1,length=7642>
##contig=<ID=chrUn_GL000195v1,length=182896>
##contig=<ID=chrUn_GL000219v1,length=179198>
##contig=<ID=chrUn_GL000220v1,length=161802>
##contig=<ID=chrUn_GL000224v1,length=179693>
##contig=<ID=chrUn_KI270741v1,length=157432>
##contig=<ID=chrUn_GL000226v1,length=15008>
##contig=<ID=chrUn_GL000213v1,length=164239>
##contig=<ID=chrUn_KI270743v1,length=210658>
##contig=<ID=chrUn_KI270744v1,length=168472>
##contig=<ID=chrUn_KI270745v1,length=41891>
##contig=<ID=chrUn_KI270746v1,length=66486>
##contig=<ID=chrUn_KI270747v1,length=198735>
##contig=<ID=chrUn_KI270748v1,length=93321>
##contig=<ID=chrUn_KI270749v1,length=158759>
##contig=<ID=chrUn_KI270750v1,length=148850>
##contig=<ID=chrUn_KI270751v1,length=150742>
##contig=<ID=chrUn_KI270752v1,length=27745>
##contig=<ID=chrUn_KI270753v1,length=62944>
##contig=<ID=chrUn_KI270754v1,length=40191>
##contig=<ID=chrUn_KI270755v1,length=36723>
##contig=<ID=chrUn_KI270756v1,length=79590>
##contig=<ID=chrUn_KI270757v1,length=71251>
##contig=<ID=chrUn_GL000214v1,length=137718>
##contig=<ID=chrUn_KI270742v1,length=186739>
##contig=<ID=chrUn_GL000216v2,length=176608>
##contig=<ID=chrUn_GL000218v1,length=161147>
##contig=<ID=chrEBV,length=171823>
##contig=<ID=MAP2K3_chr17_22578583_22605165,length=26582>
##contig=<ID=KCNJ18_chr17_22629421_22688415,length=58994>
##contig=<ID=KMT2C_chr21_7687010_7731520,length=44510>
##contig=<ID=KMT2C_chr15_6020392_6062903,length=42511>
##contig=<ID=KMT2C_chr13_14464876_14502703,length=37827>
##contig=<ID=KMT2C_chr13_14211849_14228603,length=16754>
##contig=<ID=KMT2C_chr13_10688343_10754814,length=66471>
##contig=<ID=KMT2C_chr13_10744786_10790087,length=45301>
##contig=<ID=KMT2C_chr13_11526357_11569988,length=43631>
##contig=<ID=KMT2C_chr13_11559977_11592630,length=32653>
##contig=<ID=KMT2C_chr13_14408161_14474907,length=66746>
##contig=<ID=KMT2C_chr14_3553442_3620349,length=66907>
##contig=<ID=KMT2C_chr14_3610318_3640421,length=30103>
##contig=<ID=KMT2C_chr15_5960377_6027329,length=66952>
##contig=<ID=KMT2C_chr2_89596961_89613143,length=16182>
##contig=<ID=KMT2C_chr2_90170767_90210396,length=39629>
##contig=<ID=KMT2C_chr21_7659652_7674726,length=15074>
##contig=<ID=KMT2C_chr21_8616365_8659521,length=43156>
##contig=<ID=KMT2C_chr21_8649510_8682182,length=32672>
##contig=<ID=KMT2C_chr22_6626735_6693166,length=66431>
##contig=<ID=KMT2C_chr22_6683135_6713219,length=30084>
##contig=<ID=KMT2C_chr22_11126384_11167180,length=40796>
##contig=<ID=KMT2C_chr22_11157169_11189815,length=32646>
##bcftools_filterVersion=1.18+htslib-1.18
##bcftools_filterCommand=filter --output test_S1-filtered.vcf.gz --threads 2 --output-type z -i '((ILEN < 1024 && ILEN > -1024) || TYPE!~"indel") && FILTER="PASS"' test_S1.vcf.gz; Date=Wed Oct 9 08:28:58 2024
##HiPhase_version="1.1.0-ae33ce1"
##HiPhase_command="hiphase --bam test_S1-aligned.bam --vcf test_S1-filtered.vcf.gz --output-vcf test_S1-phased.vcf.gz --output-bam test_S1-haplotagged.bam --reference GRCh38_GIABv3_no_alt_analysis_set_maskedGRC_decoys_MAP2K3_KMT2C_KCNJ18.fa --threads 2"
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set identifier">
##FORMAT=<ID=PF,Number=1,Type=String,Description="Phasing flag">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test_S1
chrX 1189204 . A <*> 0 . END=1189206 GT:GQ:MIN_DP:PL 0/0:12:4:0,12,119
chrX 1189207 . T G 5.8 PASS . GT:GQ:DP:AD:VAF:PL:PS 0|1:5:4:1,3:0.75:4,0,11:1189207
chrX 1189208 . A T,<*> 5.4 PASS . GT:GQ:DP:AD:VAF:PL 0/1:5:4:0,3,0:0.75,0:3,0,10,990,990,990
chrX 1189209 . G <*> 0 . END=1189209 GT:GQ:MIN_DP:PL ./.:0:4:17,0,77
chrX 1189210 . T <*> 0 . END=1189210 GT:GQ:MIN_DP:PL 0/0:12:4:0,12,119
chrX 1189211 . G <*> 0 . END=1189211 GT:GQ:MIN_DP:PL ./.:0:4:17,0,77
chrX 1189212 . T <*> 0 . END=1189212 GT:GQ:MIN_DP:PL 0/0:12:4:0,12,119
Loading
Loading