Skip to content

Commit

Permalink
Remove bad "safety check" in GGVCFs (#7772)
Browse files Browse the repository at this point in the history
All hom-ref sites with unnormalizes PLs threw exceptions because they violated assumptions -- remove the IllegalStateException check on monomorphic sites
  • Loading branch information
ldgauthier authored Apr 12, 2022
1 parent 3b0bc03 commit 33bda5e
Show file tree
Hide file tree
Showing 8 changed files with 20 additions and 3,386 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -287,18 +287,15 @@ else if ( clazz.equals(int[].class) ) {
*
* @param vc target variant context.
* @param numAltAllelesToKeep number of alt alleles to keep.
* @param ensureReturnContainsAlt make sure the alleles returned include an alternate, even if it's not in the most likely genotype
* @return the list of alleles to keep, including the reference and {@link Allele#NON_REF_ALLELE} if present
*
*/
public static List<Allele> calculateMostLikelyAlleles(final VariantContext vc, final int defaultPloidy,
final int numAltAllelesToKeep) {
final int numAltAllelesToKeep, boolean ensureReturnContainsAlt) {
Utils.nonNull(vc, "vc is null");
Utils.validateArg(defaultPloidy > 0, () -> "default ploidy must be > 0 but defaultPloidy=" + defaultPloidy);
Utils.validateArg(numAltAllelesToKeep > 0, () -> "numAltAllelesToKeep must be > 0, but numAltAllelesToKeep=" + numAltAllelesToKeep);
//allow PLs or GPs (as for GATK-DRAGEN), but we need some kind of genotype data
Utils.validateArg(vc.getGenotypes().stream().anyMatch(g -> g.hasPL() || g.hasExtendedAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY)), () -> "Most likely alleles cannot be calculated without likelihoods");
//NOTE: this is used in the reblocking case when we have a hom-ref GT and real ALTs
final boolean allHomRefData = vc.getGenotypes().stream().allMatch(g -> g.hasPL() && g.getPL()[0] == 0); //PL=[0,0,0] is okay, we just don't want confident variants

final boolean hasSymbolicNonRef = vc.hasAllele(Allele.NON_REF_ALLELE);
final int numberOfAllelesThatArentProperAlts = hasSymbolicNonRef ? 2 : 1;
Expand All @@ -308,11 +305,7 @@ public static List<Allele> calculateMostLikelyAlleles(final VariantContext vc, f
return vc.getAlleles();
}

final double[] likelihoodSums = calculateLikelihoodSums(vc, defaultPloidy, allHomRefData);
if (MathUtils.sum(likelihoodSums) == 0.0 && !allHomRefData) {
throw new IllegalStateException("No likelihood sum exceeded zero -- method was called for variant data " +
"with no variant information.");
}
final double[] likelihoodSums = calculateLikelihoodSums(vc, defaultPloidy, ensureReturnContainsAlt);
return filterToMaxNumberOfAltAllelesBasedOnScores(numAltAllelesToKeep, vc.getAlleles(), likelihoodSums);
}

Expand Down Expand Up @@ -342,17 +335,20 @@ public static List<Allele> filterToMaxNumberOfAltAllelesBasedOnScores(int numAlt
*
* Since GLs are log likelihoods, this quantity is thus
* SUM_{samples whose likeliest genotype contains this alt allele} log(likelihood alt / likelihood hom ref)
* @param vc
* @param defaultPloidy
* @param countAllelesWithoutHomRef true if we know the input is hom-ref, but we still want to know the most likely ALT
*/
@VisibleForTesting
static double[] calculateLikelihoodSums(final VariantContext vc, final int defaultPloidy, final boolean allHomRefData) {
static double[] calculateLikelihoodSums(final VariantContext vc, final int defaultPloidy, final boolean countAllelesWithoutHomRef) {
final double[] likelihoodSums = new double[vc.getNAlleles()];
for ( final Genotype genotype : vc.getGenotypes().iterateInSampleNameOrder() ) {
final GenotypeLikelihoods gls = genotype.getLikelihoods();
if (gls == null) {
continue;
}
final double[] glsVector = gls.getAsVector();
final int indexOfMostLikelyVariantGenotype = MathUtils.maxElementIndex(glsVector, allHomRefData ? 1 : 0, glsVector.length);
final int indexOfMostLikelyVariantGenotype = MathUtils.maxElementIndex(glsVector, countAllelesWithoutHomRef ? 1 : 0, glsVector.length);
final double GLDiffBetweenRefAndBestVariantGenotype = Math.abs(glsVector[indexOfMostLikelyVariantGenotype] - glsVector[PL_INDEX_OF_HOM_REF]);
final int ploidy = genotype.getPloidy() > 0 ? genotype.getPloidy() : defaultPloidy;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public VariantContext calculateGenotypes(final VariantContext vc, final Genotype

VariantContext reducedVC = vc;
if (maxAltAlleles < vc.getAlternateAlleles().size()) {
final List<Allele> allelesToKeep = AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, defaultPloidy, maxAltAlleles);
final List<Allele> allelesToKeep = AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, defaultPloidy, maxAltAlleles, false);
final GenotypesContext reducedGenotypes = allelesToKeep.size() == 1 ? GATKVariantContextUtils.subsetToRefOnly(vc, defaultPloidy) :
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), allelesToKeep, gpc,
GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL); //with no PLs in some reblocked GVCFs, no-calls are just going to cause problems, so keep 0/0 genotypes as such without trying to recall
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,8 @@ protected GenotypeBuilder changeCallToHomRefVersusNonRef(final VariantContext lo
if (posteriorsKey != null && genotype.hasExtendedAttribute(posteriorsKey)) {
subsetHomRefPosteriorsToRefVersusNonRef(lowQualVariant, gb);
} else {
final List<Allele> bestAlleles = AlleleSubsettingUtils.calculateMostLikelyAlleles(lowQualVariant, genotype.getPloidy(), 1);
//find best ALT so we can use its likelihood for NON_REF
final List<Allele> bestAlleles = AlleleSubsettingUtils.calculateMostLikelyAlleles(lowQualVariant, genotype.getPloidy(), 1, true);
final Allele bestAlt = bestAlleles.stream().filter(a -> !a.isReference()).findFirst().orElse(Allele.NON_REF_ALLELE); //allow span dels
//we care about the best alt even though it's getting removed because NON_REF should get the best likelihoods
//it shouldn't matter that we're passing in different alt alleles since the GenotypesContext only knows
Expand Down Expand Up @@ -928,7 +929,7 @@ private static int[] getGenotypePosteriorsOtherwiseLikelihoods(final Genotype ge
private void subsetHomRefPosteriorsToRefVersusNonRef(final VariantContext result, final GenotypeBuilder gb) {
//TODO: bestAlleles needs to be modified for posteriors
final Genotype genotype = result.getGenotype(0);
final List<Allele> bestAlleles = AlleleSubsettingUtils.calculateMostLikelyAlleles(result, genotype.getPloidy(), 1);
final List<Allele> bestAlleles = AlleleSubsettingUtils.calculateMostLikelyAlleles(result, genotype.getPloidy(), 1, false);
final Allele bestAlt = bestAlleles.stream().filter(a -> !a.isReference()).findFirst().orElse(Allele.NON_REF_ALLELE); //allow span dels
final int[] idxVector = result.getGLIndicesOfAlternateAllele(bestAlt);
final int[] multiallelicPLs = getGenotypePosteriorsOtherwiseLikelihoods(genotype, posteriorsKey);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ public Object[][] gvcfsToGenotype() {
b37_reference_20_21},

//23 highly multi-allelic sites across 54 1000G exomes to test allele subsetting and QUAL calculation
//plus one 10-allele WGS variant that's all hom-ref with one GT that has unnormalized PLs from some sort of GenomicsDB corner case
{getTestFile("multiallelicQualRegression.vcf "),
getTestFile("multiallelicQualRegression.expected.vcf"),
NO_EXTRA_ARGS, hg38Reference}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers.genotyper;

import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.GATKBaseTest;
Expand Down Expand Up @@ -288,7 +287,7 @@ public Object[][] makeUpdatePLsSACsAndADData() {
public void testCalculateMostLikelyAllelesTieDoesntRemoveAllTiedAlleles(){
VariantContext vc = new VariantContextBuilder(null, "1", 100, 100, Arrays.asList(Aref, C, G))
.genotypes(Arrays.asList(new GenotypeBuilder("sample1", Arrays.asList(C,G)).PL( new double[]{5, 5, 5, 5, 0, 5}).make())).make();
Assert.assertEquals(AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, 2, 1), Arrays.asList(Aref,C)) ;
Assert.assertEquals(AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, 2, 1, false), Arrays.asList(Aref,C)) ;
}

@DataProvider
Expand All @@ -315,9 +314,9 @@ public void testThatFilteringWorksCorrectly(int numToKeep, List<Allele> alleles,
@Test
public void testCalculateMostLikelyAllelesPreconditions(){
VariantContext vc = new VariantContextBuilder(null, "1", 100, 100, Arrays.asList(Aref, C, G)).make();
Assert.assertThrows(IllegalArgumentException.class, () -> AlleleSubsettingUtils.calculateMostLikelyAlleles(null, 2, 2));
Assert.assertThrows(IllegalArgumentException.class, () -> AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, 0, 2));
Assert.assertThrows(IllegalArgumentException.class, () -> AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, 2, 0));
Assert.assertThrows(IllegalArgumentException.class, () -> AlleleSubsettingUtils.calculateMostLikelyAlleles(null, 2, 2, false));
Assert.assertThrows(IllegalArgumentException.class, () -> AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, 0, 2, false));
Assert.assertThrows(IllegalArgumentException.class, () -> AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, 2, 0, false));
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1404,7 +1404,7 @@ public void testMaxAlternateAlleles(final String bam, final String reference, fi
final Map<SimpleInterval, List<Allele>> expectedSubsettedAllelesByLocus = new HashMap<>();
for ( final VariantContext vc : callsNoMaxAlternateAlleles ) {
if ( getNumAltAllelesExcludingNonRef(vc) > maxAlternateAlleles ) {
final List<Allele> mostLikelyAlleles = AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, HomoSapiensConstants.DEFAULT_PLOIDY, maxAlternateAlleles);
final List<Allele> mostLikelyAlleles = AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, HomoSapiensConstants.DEFAULT_PLOIDY, maxAlternateAlleles, false);
expectedSubsettedAllelesByLocus.put(new SimpleInterval(vc), mostLikelyAlleles);
}
}
Expand Down
Loading

0 comments on commit 33bda5e

Please sign in to comment.