Skip to content

Commit

Permalink
Add new tool ReblockGVCF (#4940)
Browse files Browse the repository at this point in the history
* adds ReblockGVCF a tool to merge reference blocks in single-sample GVCFs for smaller file size
  • Loading branch information
ldgauthier authored and lbergelson committed Oct 3, 2018
1 parent d12121c commit 7368c62
Show file tree
Hide file tree
Showing 33 changed files with 994 additions and 69 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ public final class GenotypeGVCFs extends VariantWalker {
public static final String PHASED_HOM_VAR_STRING = "1|1";
public static final String ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME = "only-output-calls-starting-in-intervals";
public static final String ALL_SITES_LONG_NAME = "include-non-variant-sites";
public static final String ALL_SITES_SHORT_NAME = "all-sites";
private static final String GVCF_BLOCK = "GVCFBlock";

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
Expand Down Expand Up @@ -271,29 +272,20 @@ private VariantContext calculateGenotypes(VariantContext vc){
/**
* Determines whether the provided VariantContext has real alternate alleles.
*
* There is a bit of a hack to handle the <NON-REF> case because it is not defined in htsjdk.Allele
* We check for this as a biallelic symbolic allele.
*
* @param vc the VariantContext to evaluate
* @return true if it has proper alternate alleles, false otherwise
*/
@VisibleForTesting
static boolean isProperlyPolymorphic(final VariantContext vc) {
public static boolean isProperlyPolymorphic(final VariantContext vc) {
//obvious cases
if (vc == null || vc.getAlternateAlleles().isEmpty()) {
return false;
} else if (vc.isBiallelic()) {
return !(isSpanningDeletion(vc.getAlternateAllele(0)) || vc.isSymbolic());
return !(GATKVCFConstants.isSpanningDeletion(vc.getAlternateAllele(0)) || vc.isSymbolic());
} else {
return true;
}
}

@VisibleForTesting
static boolean isSpanningDeletion(final Allele allele){
return allele.equals(Allele.SPAN_DEL) || allele.equals(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED);
}

/**
* Add genotyping-based annotations to the new VC
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,39 @@ public Map<String, Object> annotate(final ReferenceContext ref,
return Collections.emptyMap();
}

final int depth = getDepth(genotypes, likelihoods);

if ( depth == 0 ) {
return Collections.emptyMap();
}

final double qual = -10.0 * vc.getLog10PError();
double QD = qual / depth;

// Hack: see note in the fixTooHighQD method below
QD = fixTooHighQD(QD);

return Collections.singletonMap(getKeyNames().get(0), String.format("%.2f", QD));
}

/**
* The haplotype caller generates very high quality scores when multiple events are on the
* same haplotype. This causes some very good variants to have unusually high QD values,
* and VQSR will filter these out. This code looks at the QD value, and if it is above
* threshold we map it down to the mean high QD value, with some jittering
*
* @param QD the raw QD score
* @return a QD value
*/
public static double fixTooHighQD(final double QD) {
if ( QD < MAX_QD_BEFORE_FIXING ) {
return QD;
} else {
return IDEAL_HIGH_QD + Utils.getRandomGenerator().nextGaussian() * JITTER_SIGMA;
}
}

public static int getDepth(final GenotypesContext genotypes, final ReadLikelihoods<Allele> likelihoods) {
int depth = 0;
int ADrestrictedDepth = 0;

Expand Down Expand Up @@ -100,35 +133,7 @@ public Map<String, Object> annotate(final ReferenceContext ref,
if ( ADrestrictedDepth > 0 ) {
depth = ADrestrictedDepth;
}

if ( depth == 0 ) {
return Collections.emptyMap();
}

final double qual = -10.0 * vc.getLog10PError();
double QD = qual / depth;

// Hack: see note in the fixTooHighQD method below
QD = fixTooHighQD(QD);

return Collections.singletonMap(getKeyNames().get(0), String.format("%.2f", QD));
}

/**
* The haplotype caller generates very high quality scores when multiple events are on the
* same haplotype. This causes some very good variants to have unusually high QD values,
* and VQSR will filter these out. This code looks at the QD value, and if it is above
* threshold we map it down to the mean high QD value, with some jittering
*
* @param QD the raw QD score
* @return a QD value
*/
public static double fixTooHighQD(final double QD) {
if ( QD < MAX_QD_BEFORE_FIXING ) {
return QD;
} else {
return IDEAL_HIGH_QD + Utils.getRandomGenerator().nextGaussian() * JITTER_SIGMA;
}
return depth;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,12 @@ public List<String> getKeyNames() {

@Override
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0)), GATKVCFHeaderLines.getInfoLine(getRawKeyName()));
return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0)));
}

@Override
public List<VCFInfoHeaderLine> getRawDescriptions() {
return getDescriptions();
return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getRawKeyName()));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgume

public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance";
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";
public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands";
public static final String GQ_BAND_SHORT_NAME = "GQB";

/**
* You can use this argument to specify that HC should process a single sample out of a multisample BAM file. This
Expand Down Expand Up @@ -73,7 +75,7 @@ public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgume
* and end at 100 (exclusive).
*/
@Advanced
@Argument(fullName = "gvcf-gq-bands", shortName = "GQB", doc= "Exclusive upper bounds for reference confidence GQ bands " +
@Argument(fullName = GQ_BAND_LONG_NAME, shortName = GQ_BAND_SHORT_NAME, doc= "Exclusive upper bounds for reference confidence GQ bands " +
"(must be in [1, 100] and specified in increasing order)", optional = true)
public List<Integer> GVCFGQBands = new ArrayList<>(70);
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ protected String callSourceString() {

@Override
protected boolean forceKeepAllele(final Allele allele) {
return allele == Allele.NON_REF_ALLELE || referenceConfidenceMode != ReferenceConfidenceMode.NONE
return allele.equals(Allele.NON_REF_ALLELE,false) || referenceConfidenceMode != ReferenceConfidenceMode.NONE
|| configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES;
}

Expand Down
Loading

0 comments on commit 7368c62

Please sign in to comment.