Skip to content

Commit

Permalink
Added new argument "--variant-output-filtering" to variant walkers fo…
Browse files Browse the repository at this point in the history
…r customized filtering of output variants to intervals. (#6388)

Co-authored-by: James <[email protected]>
  • Loading branch information
lbergelson and jamesemery authored Sep 18, 2024
1 parent 2ba3a15 commit 6036d67
Show file tree
Hide file tree
Showing 18 changed files with 619 additions and 156 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ private StandardArgumentDefinitions(){}
public static final String INVALIDATE_PREVIOUS_FILTERS_LONG_NAME = "invalidate-previous-filters";
public static final String SORT_ORDER_LONG_NAME = "sort-order";
public static final String FLOW_ORDER_FOR_ANNOTATIONS = "flow-order-for-annotations";

public static final String VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME = "variant-output-filtering";

public static final String INPUT_SHORT_NAME = "I";
public static final String OUTPUT_SHORT_NAME = "O";
Expand Down
41 changes: 31 additions & 10 deletions src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
import java.util.*;
import java.util.stream.Stream;


import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKAnnotationPluginDescriptor;
Expand Down Expand Up @@ -45,6 +47,7 @@
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.writers.ShardingVCFWriter;
import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;

/**
* Base class for all GATK tools. Tool authors that want to write a "GATK" tool but not use one of
Expand Down Expand Up @@ -417,6 +420,13 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
*/
public String getProgressMeterRecordLabel() { return ProgressMeter.DEFAULT_RECORD_LABEL; }

/**
* @return default value does no filtering. Override to change how variants are filtered against the intervals for your tools.
*/
public IntervalFilteringVcfWriter.Mode getVariantOutputFilteringMode(){
return IntervalFilteringVcfWriter.Mode.ANYWHERE;
}

protected List<SimpleInterval> transformTraversalIntervals(final List<SimpleInterval> getIntervals, final SAMSequenceDictionary sequenceDictionary) {
return getIntervals;
}
Expand Down Expand Up @@ -600,7 +610,7 @@ public boolean requiresIntervals() {

/**
* Does this tool want to disable the progress meter? If so, override here to return true
*
*
* @return true if this tools wants to disable progress meter output, otherwise false
*/
public boolean disableProgressMeter() {
Expand Down Expand Up @@ -727,12 +737,16 @@ protected void onStartup() {

initializeIntervals(); // Must be initialized after reference, reads and features, since intervals currently require a sequence dictionary from another data source

if ( seqValidationArguments.performSequenceDictionaryValidation()) {
if (seqValidationArguments.performSequenceDictionaryValidation()) {
validateSequenceDictionaries();
}

checkToolRequirements();

if ((getVariantOutputFilteringMode() != IntervalFilteringVcfWriter.Mode.ANYWHERE ) && userIntervals == null){
throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME + " was specified or if the tool uses interval filtering.");
}

initializeProgressMeter(getProgressMeterRecordLabel());
}

Expand Down Expand Up @@ -911,20 +925,27 @@ public VariantContextWriter createVCFWriter(final Path outPath) {
if (outputSitesOnlyVCFs) {
options.add(Options.DO_NOT_WRITE_GENOTYPES);
}

final VariantContextWriter unfilteredWriter;
if (maxVariantsPerShard > 0) {
return new ShardingVCFWriter(
unfilteredWriter = new ShardingVCFWriter(
outPath,
maxVariantsPerShard,
sequenceDictionary,
createOutputVariantMD5,
options.toArray(new Options[options.size()]));
options.toArray(new Options[0]));
} else {
unfilteredWriter = GATKVariantContextUtils.createVCFWriter(
outPath,
sequenceDictionary,
createOutputVariantMD5,
options.toArray(new Options[0]));
}
return GATKVariantContextUtils.createVCFWriter(
outPath,
sequenceDictionary,
createOutputVariantMD5,
options.toArray(new Options[options.size()]));

return getVariantOutputFilteringMode() == IntervalFilteringVcfWriter.Mode.ANYWHERE ?
unfilteredWriter :
new IntervalFilteringVcfWriter(unfilteredWriter,
intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()),
getVariantOutputFilteringMode());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.filters.CountingVariantFilter;
import org.broadinstitute.hellbender.engine.filters.VariantFilter;
import org.broadinstitute.hellbender.engine.filters.VariantFilterLibrary;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.transformers.VariantTransformer;
import org.broadinstitute.hellbender.utils.IndexUtils;
import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;

import java.util.Spliterator;
import java.util.stream.Stream;
Expand All @@ -33,6 +37,11 @@ public abstract class VariantWalkerBase extends WalkerBase {
* queries on the driving variants).
*/
public static final int DEFAULT_DRIVING_VARIANTS_LOOKAHEAD_BASES = 100_000;
@Argument(fullName = StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME,
doc = "Restrict the output variants to ones that match the specified intervals according to the specified matching mode.",
optional = true)
@Advanced
public IntervalFilteringVcfWriter.Mode userOutputVariantIntervalFilteringMode = null;

//Various options for reading from a GenomicsDB
protected GenomicsDBOptions genomicsDBOptions;
Expand Down Expand Up @@ -103,6 +112,16 @@ public SAMSequenceDictionary getBestAvailableSequenceDictionary() {
*/
public abstract VCFHeader getHeaderForVariants();

@Override
public IntervalFilteringVcfWriter.Mode getVariantOutputFilteringMode() {
if (userOutputVariantIntervalFilteringMode != null) {
return userOutputVariantIntervalFilteringMode;
} else {
// Use whatever is the default provided by GATKTool
return super.getVariantOutputFilteringMode();
}
}

/**
* Return the primary sequence dictionary to be used for the driving variants for this tool. The value returned
* will usually have been prepared in {@link #initializeDrivingVariants}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKAnnotationPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
Expand All @@ -29,11 +33,23 @@
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.GenomeLoc;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.GenomeLocSortedSet;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalSetRule;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;

import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -114,7 +130,7 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
/**
* Import all data between specified intervals. Improves performance using large lists of intervals, as in exome
* sequencing, especially if GVCF data only exists for specified intervals. Use with
* --only-output-calls-starting-in-intervals if input GVCFs contain calls outside the specified intervals.
* --{@value StandardArgumentDefinitions#VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME} if input GVCFs contain calls outside the specified intervals.
*/
@Argument(fullName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
shortName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
Expand Down Expand Up @@ -155,16 +171,6 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
@ArgumentCollection
private GenomicsDBArgumentCollection genomicsdbArgs = new GenomicsDBArgumentCollection();

/**
* This option can only be activated if intervals are specified.
*/
@Advanced
@Argument(fullName= ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME,
doc="Restrict variant output to sites that start within provided intervals",
optional=true)
private boolean onlyOutputCallsStartingInIntervals = false;


@Argument(fullName = FORCE_OUTPUT_INTERVALS_NAME,
suppressFileExpansion = true, doc = "sites at which to output genotypes even if non-variant in samples", optional = true)
protected final List<String> forceOutputIntervalStrings = new ArrayList<>();
Expand All @@ -186,9 +192,6 @@ public final class GenotypeGVCFs extends VariantLocusWalker {

private VariantContextWriter vcfWriter;

/** these are used when {@link #onlyOutputCallsStartingInIntervals) is true */
private List<SimpleInterval> intervals;

private OverlapDetector<GenomeLoc> forceOutputIntervals;

private boolean forceOutputIntervalsPresent;
Expand Down Expand Up @@ -269,23 +272,14 @@ public void onTraversalStart() {

final VCFHeader inputVCFHeader = getHeaderForVariants();

if(onlyOutputCallsStartingInIntervals) {
if( !hasUserSuppliedIntervals()) {
throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME + " was specified.");
}
}

intervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
Collections.emptyList();

final Collection<Annotation> variantAnnotations = makeVariantAnnotations();
final Collection<Annotation> variantAnnotations = makeVariantAnnotations();
final Set<Annotation> annotationsToKeep = getAnnotationsToKeep();
annotationEngine = new VariantAnnotatorEngine(variantAnnotations, dbsnp.dbsnp, Collections.emptyList(), false, keepCombined, annotationsToKeep);

merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants(), somaticInput, false, true);

//methods that cannot be called in engine bc its protected
Set<VCFHeaderLine> defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
vcfWriter = createVCFWriter(outputFile);

//create engine object
Expand All @@ -294,7 +288,6 @@ public void onTraversalStart() {

//call initialize method in engine class that creates VCFWriter object and writes a header to it
vcfWriter = gvcfEngine.setupVCFWriter(defaultToolVCFHeaderLines, keepCombined, dbsnp, vcfWriter);

}

private Set<Annotation> getAnnotationsToKeep() {
Expand All @@ -316,9 +309,7 @@ public void apply(final Locatable loc, List<VariantContext> variants, ReadsConte
final VariantContext regenotypedVC = gvcfEngine.callRegion(loc, variants, ref, features, merger, somaticInput, tlodThreshold, afTolerance, forceOutput);

if (regenotypedVC != null) {
final SimpleInterval variantStart = new SimpleInterval(regenotypedVC.getContig(), regenotypedVC.getStart(), regenotypedVC.getStart());
if ((forceOutput || !GATKVariantContextUtils.isSpanningDeletionOnly(regenotypedVC)) &&
(!onlyOutputCallsStartingInIntervals || intervals.stream().anyMatch(interval -> interval.contains (variantStart)))) {
if ((forceOutput || !GATKVariantContextUtils.isSpanningDeletionOnly(regenotypedVC))) {
vcfWriter.add(regenotypedVC);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,6 @@ public final class GnarlyGenotyper extends VariantWalker {
@Argument(fullName = "keep-all-sites", doc="Retain low quality and non-variant sites, applying appropriate filters", optional=true)
private boolean keepAllSites = false;

/**
* This option can only be activated if intervals are specified.
*/
@Advanced
@Argument(fullName = GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME,
doc="Restrict variant output to sites that start within provided intervals",
optional=true)
private boolean onlyOutputCallsStartingInIntervals = false;

@Argument(fullName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
shortName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
doc = "Boolean flag to read in all data in between intervals. Improves performance reading from GenomicsDB " +
Expand Down Expand Up @@ -145,9 +136,6 @@ public final class GnarlyGenotyper extends VariantWalker {
private final RMSMappingQuality mqCalculator = RMSMappingQuality.getInstance();
private final Set<Class<? extends InfoFieldAnnotation>> allAlleleSpecificAnnotations = new HashSet<>();

/** these are used when {@link #onlyOutputCallsStartingInIntervals) is true */
private List<SimpleInterval> intervals;

@Override
public boolean requiresReference() {
return true;
Expand Down Expand Up @@ -182,14 +170,6 @@ protected GenomicsDBOptions getGenomicsDBOptions() {
public void onTraversalStart() {
final VCFHeader inputVCFHeader = getHeaderForVariants();

if(onlyOutputCallsStartingInIntervals) {
if( !intervalArgumentCollection.intervalsSpecified()) {
throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME + " was specified.");
}
}
intervals = intervalArgumentCollection.intervalsSpecified() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
Collections.emptyList();

final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples());

setupVCFWriter(inputVCFHeader, samples);
Expand Down Expand Up @@ -260,11 +240,10 @@ private void setupVCFWriter(VCFHeader inputVCFHeader, SampleList samples) {
@SuppressWarnings({"unchecked", "rawtypes"})
@Override
public void apply(VariantContext variant, ReadsContext reads, ReferenceContext ref, FeatureContext features) {
SimpleInterval variantStart = new SimpleInterval(variant.getContig(), variant.getStart(), variant.getStart());
//return early if there's no non-symbolic ALT since GDB already did the merging
if ( !variant.isVariant() || !GATKVariantContextUtils.isProperlyPolymorphic(variant)
|| variant.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) == 0
|| (onlyOutputCallsStartingInIntervals && !intervals.stream().anyMatch(interval -> interval.contains(variantStart)))) {
|| variant.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) == 0 )
{
if (keepAllSites) {
VariantContextBuilder builder = new VariantContextBuilder(mqCalculator.finalizeRawMQ(variant)); //don't fill in QUAL here because there's no alt data
builder.filter(GATKVCFConstants.LOW_QUAL_FILTER_NAME);
Expand All @@ -291,7 +270,7 @@ public void apply(VariantContext variant, ReadsContext reads, ReferenceContext r
finalizedVC = genotyperEngine.finalizeGenotype(variant);
}
//could return null if the variant didn't pass the genotyping arg calling/emission threshold
if (finalizedVC != null && (!onlyOutputCallsStartingInIntervals || intervals.stream().anyMatch(interval -> interval.contains(variantStart)))) {
if (finalizedVC != null) {
vcfWriter.add(finalizedVC);
}
}
Expand Down
Loading

0 comments on commit 6036d67

Please sign in to comment.