diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 5224fe5f7d6..5f1a2dc17ab 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -13,6 +13,7 @@ public final class GATKSVVCFConstants { // VCF standard keys reserved for sv public static final String SVTYPE = "SVTYPE"; public static final String SVLEN = "SVLEN"; + public static final String EVIDENCE = "EVIDENCE"; public static final String IMPRECISE = "IMPRECISE"; public static final String CIPOS = "CIPOS"; public static final String CIEND = "CIEND"; @@ -31,6 +32,14 @@ public final class GATKSVVCFConstants { public static final Allele DEL_ALLELE = Allele.create("", false); public static final Allele DUP_ALLELE = Allele.create("", false); + // Evidence types + public enum EvidenceTypes { + BAF, + PE, + RD, + SR + } + // GATK-SV specific header lines // TODO: 10/3/17 the following comment is a goal we are trying to achieve // applicable to all records all the time @@ -136,8 +145,13 @@ public enum ComplexVariantSubtype { public static final String BND_DELETION_STRANDS = "+-"; public static final String BND_DUPLICATION_STRANDS = "-+"; + // SR support + public static final String BOTHSIDES_SUPPORT_ATTRIBUTE = "BOTHSIDES_SUPPORT"; + public static final String HIGH_SR_BACKGROUND_ATTRIBUTE = "HIGH_SR_BACKGROUND"; + // format block public static final String COPY_NUMBER_FORMAT = "CN"; + public static final String DEPTH_GENOTYPE_COPY_NUMBER_FORMAT = "RD_CN"; public static final String EXPECTED_COPY_NUMBER_FORMAT = "ECN"; public static final String COPY_NUMBER_QUALITY_FORMAT = "CNQ"; @@ -175,6 +189,9 @@ public enum ComplexVariantSubtype { public static final String TRUTH_ALLELE_NUMBER_INFO = "TRUTH_AN"; public static final String 
TRUTH_ALLELE_FREQUENCY_INFO = "TRUTH_AF"; + // stratification + public static final String STRATUM_INFO_KEY = "STRAT"; + // functional annotations public static final String LOF = "PREDICTED_LOF"; public static final String INT_EXON_DUP = "PREDICTED_INTRAGENIC_EXON_DUP"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java index 3f3258d6161..3b0466f4bd6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java @@ -21,6 +21,7 @@ import java.util.stream.Stream; import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.COPY_NUMBER_FORMAT; +import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT; public class SVCallRecord implements SVLocatable { @@ -31,6 +32,7 @@ public class SVCallRecord implements SVLocatable { VCFConstants.END_KEY, GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, GATKSVVCFConstants.SVLEN, + GATKSVVCFConstants.EVIDENCE, GATKSVVCFConstants.CONTIG2_ATTRIBUTE, GATKSVVCFConstants.END2_ATTRIBUTE, GATKSVVCFConstants.STRANDS_ATTRIBUTE, @@ -48,6 +50,7 @@ public class SVCallRecord implements SVLocatable { private final Boolean strandB; private final GATKSVVCFConstants.StructuralVariantAnnotationType type; private final Integer length; + private final List evidence; private final List algorithms; private final List alleles; private final Allele refAllele; @@ -72,6 +75,7 @@ public SVCallRecord(final String id, final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype, final List cpxIntervals, final Integer length, + final List evidence, final List algorithms, final List alleles, final List genotypes, @@ -79,7 +83,7 @@ public SVCallRecord(final String id, final Set filters, final Double log10PError, final SAMSequenceDictionary dictionary) { - this(id, contigA, positionA, strandA, 
contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, algorithms, alleles, genotypes, attributes, filters, log10PError); + this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, evidence, algorithms, alleles, genotypes, attributes, filters, log10PError); validateCoordinates(dictionary); } @@ -94,6 +98,7 @@ protected SVCallRecord(final String id, final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype, final List cpxIntervals, final Integer length, + final List evidence, final List algorithms, final List alleles, final List genotypes, @@ -106,6 +111,7 @@ protected SVCallRecord(final String id, Utils.nonNull(attributes); Utils.nonNull(filters); Utils.nonNull(cpxIntervals); + Utils.nonNull(evidence); this.id = Utils.nonNull(id); this.contigA = contigA; this.positionA = positionA; @@ -123,6 +129,7 @@ protected SVCallRecord(final String id, this.genotypes = GenotypesContext.copy(genotypes).immutable(); this.attributes = validateAttributes(attributes); this.length = inferLength(type, positionA, positionB, length); + this.evidence = evidence; final Pair strands = inferStrands(type, strandA, strandB); this.strandA = strands.getLeft(); this.strandB = strands.getRight(); @@ -272,7 +279,8 @@ private boolean isCarrier(final Genotype genotype) { } // Otherwise, try to infer status if it's a biallelic CNV with a copy number call - final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT, expectedCopyNumber); + final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT, + VariantContextGetters.getAttributeAsInt(genotype, DEPTH_GENOTYPE_COPY_NUMBER_FORMAT, expectedCopyNumber)); if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL) { return copyNumber < expectedCopyNumber; } else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DUP) { @@ -370,6 +378,10 @@ public Integer getLength() { return length; } + public List 
getEvidence() { + return evidence; + } + public List getAlgorithms() { return algorithms; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java index cf31d654727..4a13d62119e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java @@ -18,6 +18,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static htsjdk.variant.vcf.VCFConstants.MISSING_VALUE_v4; import static org.broadinstitute.hellbender.tools.sv.SVCallRecord.UNDEFINED_LENGTH; public final class SVCallRecordUtils { @@ -91,6 +92,9 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record) && record.getStrandA() != null && record.getStrandB() != null) { builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record)); } + if (!record.getEvidence().isEmpty()) { + builder.attribute(GATKSVVCFConstants.EVIDENCE, record.getEvidence()); + } if (!record.getFilters().isEmpty()) { builder.filters(record.getFilters()); } @@ -173,12 +177,12 @@ public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(fin */ public static SVCallRecord copyCallWithNewGenotypes(final SVCallRecord record, final GenotypesContext genotypes) { return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(), - record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(), + record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getEvidence(), record.getAlgorithms(), record.getAlleles(), genotypes, record.getAttributes(), record.getFilters(), 
record.getLog10PError()); } public static SVCallRecord copyCallWithNewAttributes(final SVCallRecord record, final Map attr) { return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(), - record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(), + record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getEvidence(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), attr, record.getFilters(), record.getLog10PError()); } @@ -291,10 +295,10 @@ public static Stream convertInversionsToBreakends(final SVCallReco Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal"); final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null, - record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary); + record.getEvidence(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary); final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null, - record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary); + record.getEvidence(), 
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary); return Stream.of(positiveBreakend, negativeBreakend); } @@ -319,8 +323,9 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari final GATKSVVCFConstants.StructuralVariantAnnotationType type = inferStructuralVariantType(variant); final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype = getComplexSubtype(variant); - final List cpxIntervals = parseComplexIntervals(variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null), dictionary); + final List cpxIntervals = parseComplexIntervals(variant, dictionary); final List algorithms = getAlgorithms(variant); + final List evidence = getEvidence(variant); final String strands; if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL @@ -375,12 +380,13 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari final Map sanitizedAttributes = sanitizeAttributes(attributes); return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype, - cpxIntervals, length, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes, + cpxIntervals, length, evidence, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes, variant.getFilters(), log10PError); } - private static List parseComplexIntervals(final List intervals, final SAMSequenceDictionary dictionary) { - return intervals.stream().map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList(); + private static List parseComplexIntervals(final VariantContext variant, final SAMSequenceDictionary dictionary) { + return variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null).stream() + .map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList(); } private static Map sanitizeAttributes(final Map attributes) { @@ -402,6 +408,19 @@ 
private static Integer getLength(final VariantContext variant, final GATKSVVCFCo return length; } + public static List getEvidence(final VariantContext variant) { + Utils.nonNull(variant); + final List value = variant.getAttributeAsStringList(GATKSVVCFConstants.EVIDENCE, null); + if (value == null) { + return Collections.emptyList(); + } else { + return value.stream() + .filter(v -> v != null && !v.equals(MISSING_VALUE_v4)) + .map(GATKSVVCFConstants.EvidenceTypes::valueOf) + .collect(Collectors.toList()); + } + } + public static List getAlgorithms(final VariantContext variant) { Utils.nonNull(variant); Utils.validateArg(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE), "Expected " + GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE + " field for variant " + variant.getID()); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java index 39228617d26..8a1f68567a5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java @@ -23,6 +23,7 @@ import java.util.*; import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Class for collapsing a collection of similar {@link SVCallRecord} objects, such as clusters produced by @@ -79,6 +80,32 @@ public enum AltAlleleSummaryStrategy { } + /** + * Flag field logic + */ + public enum FlagFieldLogic { + /** + * Require all members to have the flag set + */ + AND, + + /** + * Require at least one member to have the flag set + */ + OR, + + /** + * Always set to false + */ + ALWAYS_FALSE + + } + + public static final Set FLAG_TYPE_INFO_FIELDS = Sets.newHashSet( + GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE, + GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE + ); + private static final Set SUPPORTED_SV_TYPES = Sets.newHashSet( 
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, @@ -90,6 +117,8 @@ public enum AltAlleleSummaryStrategy { GATKSVVCFConstants.StructuralVariantAnnotationType.CTX ); + private static final BreakpointEvidenceComparator breakpointEvidenceComparator = new BreakpointEvidenceComparator(); + /** * Comparators used for picking the representative genotype for a given sample */ @@ -139,16 +168,19 @@ public int compare(Genotype o1, Genotype o2) { private final AltAlleleSummaryStrategy altAlleleSummaryStrategy; private final BreakpointSummaryStrategy breakpointSummaryStrategy; + private final FlagFieldLogic flagFieldLogic; private final ReferenceSequenceFile reference; private final SAMSequenceDictionary dictionary; public CanonicalSVCollapser(final ReferenceSequenceFile reference, final AltAlleleSummaryStrategy altAlleleSummaryStrategy, - final BreakpointSummaryStrategy breakpointSummaryStrategy) { + final BreakpointSummaryStrategy breakpointSummaryStrategy, + final FlagFieldLogic flagFieldLogic) { this.reference = Utils.nonNull(reference); this.dictionary = reference.getSequenceDictionary(); this.altAlleleSummaryStrategy = altAlleleSummaryStrategy; this.breakpointSummaryStrategy = breakpointSummaryStrategy; + this.flagFieldLogic = flagFieldLogic; } private static final int distance(final SVCallRecord item, final int newStart, final int newEnd) { @@ -193,7 +225,7 @@ public SVCallRecord collapse(final SVClusterEngine.OutputCluster cluster) { return new SVCallRecord(representative.getId(), representative.getContigA(), start, strandA, representative.getContigB(), end, strandB, type, representative.getComplexSubtype(), representative.getComplexEventIntervals(), - length, algorithms, alleles, genotypes, attributes, filters, quality, dictionary); + length, representative.getEvidence(), algorithms, alleles, genotypes, attributes, filters, quality, dictionary); } protected List collapseAlleles(final List altAlleles, final 
Allele refAllele) { @@ -562,15 +594,37 @@ public static List makeBiallelicList(final Allele alt, final Allele ref, return alleles; } + private Stream getItemFlagStream(final String key, final Collection items) { + return items.stream() + .map(item ->item.getAttributes().get(key) != null && item.getAttributes().get(key).equals(Boolean.TRUE)); + } + protected Map collapseAttributes(final SVCallRecord representative, final Collection items) { Utils.nonNull(items); Utils.nonEmpty(items); final Map attributes = new HashMap<>(); for (final Map.Entry entry : representative.getAttributes().entrySet()) { - attributes.put(entry.getKey(), entry.getValue()); + if (!FLAG_TYPE_INFO_FIELDS.contains(entry.getKey())) { + attributes.put(entry.getKey(), entry.getValue()); + } } attributes.put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, items.stream().map(SVCallRecord::getId).sorted().collect(Collectors.toList())); + for (final String key : FLAG_TYPE_INFO_FIELDS) { + if (flagFieldLogic == FlagFieldLogic.AND) { + if (getItemFlagStream(key, items).allMatch(Boolean::booleanValue)) { + attributes.put(key, Boolean.TRUE); + } + } else if (flagFieldLogic == FlagFieldLogic.OR) { + if (getItemFlagStream(key, items).anyMatch(Boolean::booleanValue)) { + attributes.put(key, Boolean.TRUE); + } + } else if (flagFieldLogic == FlagFieldLogic.ALWAYS_FALSE) { + // Leave empty to imply FALSE + } else { + throw new IllegalArgumentException("Unsupported " + FlagFieldLogic.class.getSimpleName() + " value: " + flagFieldLogic.name()); + } + } return attributes; } @@ -671,16 +725,45 @@ private SVCallRecord getRepresentativeIntervalItem(final Collection qualityComparator = Comparator.comparing(r -> r.getLog10PError() == null ? 
0 : r.getLog10PError()); final Comparator carrierCountComparator = Comparator.comparing(r -> -r.getCarrierGenotypeList().size()); final Comparator distanceComparator = Comparator.comparing(r -> getDistance(r.getPositionA(), r.getPositionB(), starts, ends)); - final Comparator idComparator = Comparator.comparing(r -> getDistance(r.getPositionA(), r.getPositionB(), starts, ends)); // stabilizes order + final Comparator idComparator = Comparator.comparing(SVCallRecord::getId); // stabilizes order return records.stream().min( - carrierCountComparator + qualityComparator + .thenComparing(breakpointEvidenceComparator) + .thenComparing(carrierCountComparator) .thenComparing(distanceComparator) .thenComparing(idComparator)).get(); } + /*** + * This class is for comparing evidence types for the purposes of breakpoint refinement. It prioritizes as follows: + * SR < PE < all other types. Note that SR is the "best" evidence but corresponds to the "least" value when sorting + * in ascending order. + */ + protected static class BreakpointEvidenceComparator implements Comparator { + @Override + public int compare(final SVCallRecord a, final SVCallRecord b) { + final Set evidenceA = new HashSet<>(a.getEvidence()); + final Set evidenceB = new HashSet<>(b.getEvidence()); + // SR < PE and if neither they are considered equal + // Note sorting is in ascending order, and we want the highest-priority record first + if (evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.SR) && !evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.SR)) { + return -1; + } else if (!evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.SR) && evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.SR)) { + return 1; + } else if (evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.PE) && !evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.PE)) { + return -1; + } else if (!evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.PE) && evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.PE)) { + return 1; + } else { + 
return 0; + } + } + } + protected static long getDistance(final int posA, final int posB, final int[] starts, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java index 0199eaae3e3..1100809f59d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java @@ -23,8 +23,6 @@ * *

NOTE: precise implementation of {@link SVClusterLinkage#getMaxClusterableStartingPosition(SVLocatable)} * is important for efficiency because it determines when a cluster can be finalized and omitted from further clustering tests.

- * - * @param class of items to cluster */ public class SVClusterEngine { @@ -41,7 +39,6 @@ public enum CLUSTERING_TYPE { private Map idToClusterMap; // Active clusters private final Map idToItemMap; // Active items protected final CLUSTERING_TYPE clusteringType; - private final ItemSortingBuffer buffer; private final Comparator itemComparator; private String currentContig; @@ -65,30 +62,12 @@ public SVClusterEngine(final CLUSTERING_TYPE clusteringType, currentContig = null; idToItemMap = new HashMap<>(); itemComparator = SVCallRecordUtils.getSVLocatableComparator(dictionary); - buffer = new ItemSortingBuffer(); nextItemId = 0; nextClusterId = 0; lastStart = 0; minActiveStartingPositionItemId = null; } - - /** - * Flushes all active clusters, adding them to the output buffer. Results from the output buffer are then copied out - * and the buffer is cleared. This should be called between contigs to save memory. - */ - public final List forceFlush() { - flushClusters(); - return buffer.forceFlush(); - } - - /** - * Gets any available finalized clusters. - */ - public final List flush() { - return buffer.flush(); - } - @VisibleForTesting public Function getCollapser() { return collapser; @@ -109,25 +88,26 @@ public SVCallRecord getMinActiveStartingPositionItem() { * Returns true if there are any active or finalized clusters. */ public final boolean isEmpty() { - return idToClusterMap.isEmpty() && buffer.isEmpty(); + return idToClusterMap.isEmpty(); } /** * Adds and clusters the given item. Note that items must be added in order of increasing start position. 
* @param item item to cluster */ - public final void add(final SVCallRecord item) { + public final List addAndFlush(final SVCallRecord item) { // Start a new cluster if on a new contig if (!item.getContigA().equals(currentContig)) { - flushClusters(); + final List result = flush(); currentContig = item.getContigA(); lastStart = 0; seedCluster(registerItem(item)); - return; + return result; + } else { + final int itemId = registerItem(item); + final List clusterIdsToProcess = cluster(itemId); + return processClusters(clusterIdsToProcess); } - final int itemId = registerItem(item); - final List clusterIdsToProcess = cluster(itemId); - processClusters(clusterIdsToProcess); } private final int registerItem(final SVCallRecord item) { @@ -263,12 +243,12 @@ private final void combineClusters(final Collection clusterIds, final I /** * Finalizes a single cluster, removing it from the currently active set and adding it to the output buffer. */ - private final void processCluster(final int clusterIndex) { + private final SVCallRecord processCluster(final int clusterIndex) { final Cluster cluster = getCluster(clusterIndex); idToClusterMap.remove(clusterIndex); final List clusterItemIds = cluster.getItemIds(); final OutputCluster outputCluster = new OutputCluster(clusterItemIds.stream().map(idToItemMap::get).collect(Collectors.toList())); - buffer.add(collapser.apply(outputCluster)); + final SVCallRecord result = collapser.apply(outputCluster); // Clean up item id map if (clusterItemIds.size() == 1) { // Singletons won't be present in any other clusters @@ -289,6 +269,7 @@ private final void processCluster(final int clusterIndex) { if (clusterItemIds.contains(minActiveStartingPositionItemId)) { findAndSetMinActiveStart(); } + return result; } /** @@ -309,25 +290,29 @@ private final void findAndSetMinActiveStart() { /** * Finalizes a set of clusters. 
*/ - private final void processClusters(final List clusterIdsToProcess) { + private final List processClusters(final List clusterIdsToProcess) { + final List result = new ArrayList<>(clusterIdsToProcess.size()); for (final Integer clusterId : clusterIdsToProcess) { - processCluster(clusterId); + result.add(processCluster(clusterId)); } + return result; } /** * Finalizes all active clusters and adds them to the output buffer. Also clears the currently active set of clusters * and items. */ - private final void flushClusters() { + public final List flush() { final List clustersToFlush = new ArrayList<>(idToClusterMap.keySet()); + final List result = new ArrayList<>(clustersToFlush.size()); for (final Integer clusterId : clustersToFlush) { - processCluster(clusterId); + result.add(processCluster(clusterId)); } idToItemMap.clear(); minActiveStartingPositionItemId = null; nextItemId = 0; nextClusterId = 0; + return result; } /** @@ -431,52 +416,4 @@ public int hashCode() { return Objects.hash(itemIds); } } - - private final class ItemSortingBuffer { - private PriorityQueue buffer; - - public ItemSortingBuffer() { - Utils.nonNull(itemComparator); - this.buffer = new PriorityQueue<>(itemComparator); - } - - public void add(final SVCallRecord record) { - buffer.add(record); - } - - /** - * Returns any records that can be safely flushed based on the current minimum starting position - * of items still being actively clustered. - */ - public List flush() { - if (buffer.isEmpty()) { - return Collections.emptyList(); - } - final SVCallRecord minActiveStartItem = getMinActiveStartingPositionItem(); - if (minActiveStartItem == null) { - forceFlush(); - } - final List out = new ArrayList<>(); - while (!buffer.isEmpty() && buffer.comparator().compare(buffer.peek(), minActiveStartItem) < 0) { - out.add(buffer.poll()); - } - return out; - } - - /** - * Returns all buffered records, regardless of any active clusters. 
To be used only when certain that no - * active clusters can be clustered with any future inputs. - */ - public List forceFlush() { - final List result = new ArrayList<>(buffer.size()); - while (!buffer.isEmpty()) { - result.add(buffer.poll()); - } - return result; - } - - public boolean isEmpty() { - return buffer.isEmpty(); - } - } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java index 8f3bcaf6112..525910cf2d8 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java @@ -25,7 +25,7 @@ public static SVClusterEngine createCanonical(final SVClusterEngine.CLUSTERING_T linkage.setDepthOnlyParams(depthParameters); linkage.setMixedParams(mixedParameters); linkage.setEvidenceParams(pesrParameters); - final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, breakpointSummaryStrategy); + final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, breakpointSummaryStrategy, CanonicalSVCollapser.FlagFieldLogic.OR); return new SVClusterEngine(type, collapser::collapse, linkage, dictionary); } @@ -35,7 +35,7 @@ public static SVClusterEngine createCNVDefragmenter(final SAMSequenceDictionary final double paddingFraction, final double minSampleOverlap) { final SVClusterLinkage linkage = new CNVLinkage(dictionary, paddingFraction, minSampleOverlap); - final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END); + final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END, 
CanonicalSVCollapser.FlagFieldLogic.OR); return new SVClusterEngine(SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE, collapser::collapse, linkage, dictionary); } @@ -46,7 +46,7 @@ public static SVClusterEngine createBinnedCNVDefragmenter(final SAMSequenceDicti final double minSampleOverlap, final List coverageIntervals) { final SVClusterLinkage linkage = new BinnedCNVLinkage(dictionary, paddingFraction, minSampleOverlap, coverageIntervals); - final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END); + final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END, CanonicalSVCollapser.FlagFieldLogic.OR); return new SVClusterEngine(SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE, collapser::collapse, linkage, dictionary); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java index 433ec4ab46e..57293b6a95b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java @@ -77,17 +77,17 @@ protected static boolean hasSampleOverlap(final SVCallRecord a, final SVCallReco final Set samples = new HashSet<>(SVUtils.hashMapCapacity(genotypesA.size() + genotypesB.size())); samples.addAll(genotypesA.getSampleNames()); samples.addAll(genotypesB.getSampleNames()); + if (samples.isEmpty()) { + // Empty case considered perfect overlap + return true; + } int numMatches = 0; for (final String sample : samples) { final Genotype genotypeA = genotypesA.get(sample); final Genotype genotypeB = genotypesB.get(sample); // If one sample doesn't exist in the other set, assume reference copy state - final int cnA = genotypeA == null ? 
- VariantContextGetters.getAttributeAsInt(genotypeB, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0) - : VariantContextGetters.getAttributeAsInt(genotypeA, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0); - final int cnB = genotypeB == null ? - VariantContextGetters.getAttributeAsInt(genotypeA, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0) - : VariantContextGetters.getAttributeAsInt(genotypeB, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0); + final int cnA = getCopyState(genotypeA, genotypeB); + final int cnB = getCopyState(genotypeB, genotypeA); if (cnA == cnB) { numMatches++; } @@ -105,4 +105,20 @@ protected static boolean hasSampleOverlap(final SVCallRecord a, final SVCallReco } } + /** + * Tries to get the best copy state from the genotype. If the genotype is null, uses ploidy from a "backup" + * genotype as the default. If we have no clue, just return -1 as a null default. + */ + private static int getCopyState(final Genotype genotype, final Genotype matchedSampleGenotype) { + if (genotype == null) { + if (matchedSampleGenotype != null) { + return VariantContextGetters.getAttributeAsInt(matchedSampleGenotype, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1); + } else { + throw new IllegalArgumentException("Both genotypes are null"); + } + } else { + return VariantContextGetters.getAttributeAsInt(genotype, GATKSVVCFConstants.COPY_NUMBER_FORMAT, + VariantContextGetters.getAttributeAsInt(genotype, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT, -1)); + } + } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterWalker.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterWalker.java new file mode 100644 index 00000000000..96db9939f03 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterWalker.java @@ -0,0 +1,281 @@ +package org.broadinstitute.hellbender.tools.sv.cluster; + +import htsjdk.samtools.SAMSequenceDictionary; +import 
htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.util.SortingCollection; +import htsjdk.variant.variantcontext.GenotypesContext; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines; +import org.broadinstitute.hellbender.tools.sv.SVCallRecord; +import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils; +import org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation; +import org.broadinstitute.hellbender.utils.reference.ReferenceUtils; + +import java.util.Set; + +import static org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation.BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME; +import static org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation.FLAG_FIELD_LOGIC_LONG_NAME; + +/*** + * Base class for tools that need a simple interface for utilizing {@link SVClusterEngine}. It handles input/output easily, + * including output sorting with spilling to disk to avoid excessive memory usage. 
+ */
+public abstract class SVClusterWalker extends MultiVariantWalker {
+    public static final String PLOIDY_TABLE_LONG_NAME = "ploidy-table";
+    public static final String VARIANT_PREFIX_LONG_NAME = "variant-prefix";
+    public static final String ENABLE_CNV_LONG_NAME = "enable-cnv";
+    public static final String ALGORITHM_LONG_NAME = "algorithm";
+    public static final String FAST_MODE_LONG_NAME = "fast-mode";
+    public static final String OMIT_MEMBERS_LONG_NAME = "omit-members";
+    public static final String DEFAULT_NO_CALL_LONG_NAME = "default-no-call";
+    public static final String MAX_RECORDS_IN_RAM_LONG_NAME = "max-records-in-ram";
+
+    /**
+     * Clustering algorithms selectable through the "algorithm" argument.
+     */
+    public enum CLUSTER_ALGORITHM {
+        /**
+         * Defragment cnv cluster algorithm. Not supported with stratification.
+         */
+        DEFRAGMENT_CNV,
+        /**
+         * Single linkage cluster algorithm.
+         */
+        SINGLE_LINKAGE,
+        /**
+         * Max clique cluster algorithm.
+         */
+        MAX_CLIQUE
+    }
+
+    @Argument(
+            doc = "Output VCF",
+            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
+    )
+    protected GATKPath outputFile;
+
+    /**
+     * Expected format is tab-delimited and contains a header with the first column SAMPLE and remaining columns
+     * contig names. Each row corresponds to a sample, with the sample ID in the first column and contig ploidy
+     * integers in their respective columns.
+     */
+    @Argument(
+            doc = "Sample ploidy table (.tsv)",
+            fullName = PLOIDY_TABLE_LONG_NAME
+    )
+    protected GATKPath ploidyTablePath;
+
+    @Argument(
+            doc = "If supplied, generate variant IDs with this prefix",
+            fullName = VARIANT_PREFIX_LONG_NAME,
+            optional = true
+    )
+    protected String variantPrefix = null;
+
+    /**
+     * When enabled, DEL and DUP variants will be clustered together. The resulting records will have an SVTYPE of CNV.
+     */
+    @Argument(
+            doc = "Enable clustering DEL/DUP variants together as CNVs (does not apply to CNV defragmentation)",
+            fullName = ENABLE_CNV_LONG_NAME,
+            optional = true
+    )
+    protected boolean enableCnv = false;
+
+    /**
+     * Results in substantial space and time savings for large sample sets by clearing genotypes that are not needed for
+     * clustering, but any associated annotation fields will be set to null in the output.
+     */
+    @Argument(
+            doc = "Fast mode. Drops hom-ref and missing genotype fields and emits them as missing.",
+            fullName = FAST_MODE_LONG_NAME,
+            optional = true
+    )
+    protected boolean fastMode = false;
+
+    @Argument(
+            doc = "Omit cluster member ID annotations",
+            fullName = OMIT_MEMBERS_LONG_NAME,
+            optional = true
+    )
+    protected boolean omitMembers = false;
+
+    @Argument(fullName = BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME,
+            doc = "Strategy to use for choosing a representative value for a breakpoint cluster.",
+            optional = true)
+    protected CanonicalSVCollapser.BreakpointSummaryStrategy breakpointSummaryStrategy =
+            CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE;
+
+    @Argument(fullName = JointGermlineCNVSegmentation.ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME,
+            doc = "Strategy to use for choosing a representative alt allele for non-CNV biallelic sites with " +
+                    "different subtypes.",
+            optional = true)
+    protected CanonicalSVCollapser.AltAlleleSummaryStrategy altAlleleSummaryStrategy =
+            CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE;
+
+    @Argument(fullName = FLAG_FIELD_LOGIC_LONG_NAME,
+            doc = "Logic for collapsing Flag type INFO and FORMAT fields",
+            optional = true)
+    protected CanonicalSVCollapser.FlagFieldLogic flagFieldLogic = CanonicalSVCollapser.FlagFieldLogic.OR;
+
+    @Argument(fullName = ALGORITHM_LONG_NAME,
+            doc = "Clustering algorithm",
+            optional = true
+    )
+    protected CLUSTER_ALGORITHM algorithm = CLUSTER_ALGORITHM.SINGLE_LINKAGE;
+
+    /**
+     * Default genotypes are assigned when they cannot be inferred
from the inputs, such as when VCFs with different
+     * variants and samples are provided.
+     */
+    @Argument(fullName = DEFAULT_NO_CALL_LONG_NAME,
+            doc = "Default to no-call GT (e.g. ./.) instead of reference alleles (e.g. 0/0) when a genotype is not" +
+                    " available",
+            optional = true
+    )
+    protected boolean defaultNoCall = false;
+
+    @Argument(fullName = MAX_RECORDS_IN_RAM_LONG_NAME,
+            doc = "When writing VCF files that need to be sorted, this will specify the number of records stored in " +
+                    "RAM before spilling to disk. Increasing this number reduces the number of file handles needed to sort a " +
+                    "VCF file, and increases the amount of RAM needed.",
+            optional=true)
+    public int maxRecordsInRam = 10000;
+
+    protected SAMSequenceDictionary dictionary;
+    protected ReferenceSequenceFile reference;
+    protected PloidyTable ploidyTable;
+    protected SortingCollection sortingBuffer; // holds output records until traversal ends
+    protected VariantContextWriter writer;
+    protected VCFHeader header;
+    protected Set samples;
+    protected String currentContig; // contig of the most recently processed record, null before the first
+    protected int numVariantsBuilt = 0; // counter used when generating prefixed variant IDs
+
+    @Override
+    public boolean requiresReference() {
+        return true;
+    }
+
+    @Override
+    public void onTraversalStart() {
+        reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
+        dictionary = reference.getSequenceDictionary();
+        if (dictionary == null) {
+            throw new UserException("Reference sequence dictionary required");
+        }
+        ploidyTable = new PloidyTable(ploidyTablePath.toPath());
+        samples = getSamplesForVariants();
+        writer = createVCFWriter(outputFile);
+        header = createHeader();
+        writer.writeHeader(header);
+        currentContig = null;
+        sortingBuffer = SortingCollection.newInstance(
+                VariantContext.class,
+                new VCFRecordCodec(header, true),
+                header.getVCFRecordComparator(),
+                maxRecordsInRam,
+                tmpDir.toPath());
+    }
+
+    @Override
+    public Object onTraversalSuccess() {
+        for (final VariantContext variant : sortingBuffer) { // drain the sorting buffer into the output writer
+            writer.add(variant);
+        }
+        return
super.onTraversalSuccess();
+    }
+
+    @Override
+    public void closeTool() {
+        super.closeTool();
+        if (sortingBuffer != null) {
+            sortingBuffer.cleanup();
+        }
+        if (writer != null) {
+            writer.close();
+        }
+    }
+
+    /**
+     * Handles one converted record; invoked by {@link #apply} for every input variant. Subclasses must implement it.
+     */
+    public abstract void applyRecord(final SVCallRecord record);
+
+    @Override
+    public void apply(final VariantContext variant, final ReadsContext readsContext,
+                      final ReferenceContext referenceContext, final FeatureContext featureContext) {
+        SVCallRecord call = SVCallRecordUtils.create(variant, dictionary);
+        if (fastMode && call.getType() != GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) {
+            // Strip out non-carrier genotypes to save memory and compute
+            // Don't do for multi-allelic CNVs since carrier status can't be determined
+            final GenotypesContext filteredGenotypes = GenotypesContext.copy(call.getCarrierGenotypeList());
+            call = SVCallRecordUtils.copyCallWithNewGenotypes(call, filteredGenotypes);
+        }
+        // Track the current contig and log whenever it changes
+        if (!call.getContigA().equals(currentContig)) {
+            currentContig = call.getContigA();
+            logger.info("Processing contig " + currentContig + "...");
+        }
+        applyRecord(call);
+    }
+
+    protected VCFHeader createHeader() {
+        final VCFHeader header = new VCFHeader(getHeaderForVariants().getMetaDataInInputOrder(), samples);
+        header.setSequenceDictionary(dictionary);
+
+        // Required info lines
+        header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
+        header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVLEN));
+        header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVTYPE));
+        header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.END2_ATTRIBUTE, 1,
+                VCFHeaderLineType.Integer, "Second position"));
+        header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, 1,
+                VCFHeaderLineType.String, "Second contig"));
+        header.addMetaDataLine(new
VCFInfoHeaderLine(GATKSVVCFConstants.STRANDS_ATTRIBUTE, 1,
+                VCFHeaderLineType.String, "First and second strands"));
+        header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE,
+                VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Source algorithms"));
+        if (!omitMembers) {
+            header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY,
+                    VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Cluster variant ids"));
+        }
+        // Required format lines
+        header.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
+        return header;
+    }
+
+    protected void write(final SVCallRecord call) {
+        sortingBuffer.add(buildVariantContext(call)); // buffered here; written out in onTraversalSuccess()
+    }
+
+    protected VariantContext buildVariantContext(final SVCallRecord call) {
+        // Add genotypes for samples missing from this record
+        final GenotypesContext filledGenotypes = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles(
+                call, samples, !defaultNoCall, ploidyTable, header);
+
+        // Assign new variant ID (the original ID is kept when no prefix was supplied)
+        final String newId = variantPrefix == null ?
call.getId() : String.format("%s%08x", variantPrefix, numVariantsBuilt++);
+
+        // Rebuild the record with the new ID and the filled-in genotypes
+        final SVCallRecord finalCall = new SVCallRecord(newId, call.getContigA(), call.getPositionA(), call.getStrandA(),
+                call.getContigB(), call.getPositionB(), call.getStrandB(), call.getType(), call.getComplexSubtype(),
+                call.getComplexEventIntervals(), call.getLength(), call.getEvidence(), call.getAlgorithms(), call.getAlleles(), filledGenotypes,
+                call.getAttributes(), call.getFilters(), call.getLog10PError(), dictionary);
+        final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(finalCall);
+        if (omitMembers) {
+            builder.rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY);
+        }
+        return builder.make();
+    }
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/StratifiedClusteringTableParser.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/StratifiedClusteringTableParser.java
new file mode 100644
index 00000000000..2bc5720393b
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/StratifiedClusteringTableParser.java
@@ -0,0 +1,44 @@
+package org.broadinstitute.hellbender.tools.sv.cluster;
+
+import com.google.common.collect.ImmutableSet;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+
+import java.util.Set;
+import java.util.function.Function;
+
+public class StratifiedClusteringTableParser {
+
+    // Column names of the clustering configuration table; all are collected in COLUMN_NAMES
+    public static final String NAME_COLUMN = "NAME";
+    public static final String RECIPROCAL_OVERLAP_COLUMN = "RECIPROCAL_OVERLAP";
+    public static final String SIZE_SIMILARITY_COLUMN = "SIZE_SIMILARITY";
+    public static final String BREAKEND_WINDOW_COLUMN = "BREAKEND_WINDOW";
+    public static final String SAMPLE_OVERLAP_COLUMN = "SAMPLE_OVERLAP";
+    protected static final Set COLUMN_NAMES = ImmutableSet.of(NAME_COLUMN, RECIPROCAL_OVERLAP_COLUMN, SIZE_SIMILARITY_COLUMN,
BREAKEND_WINDOW_COLUMN, SAMPLE_OVERLAP_COLUMN);
+
+    public static Function tableParser(TableColumnCollection columns, Function exceptionFactory) {
+        for (final String column : COLUMN_NAMES) { // every required column must be present
+            if (!columns.contains(column)) {
+                throw exceptionFactory.apply("Missing column " + column);
+            }
+        }
+        if (columns.columnCount() != COLUMN_NAMES.size()) { // ...and no extra columns are allowed
+            throw exceptionFactory.apply("Expected " + COLUMN_NAMES.size() + " columns but found " + columns.columnCount());
+        }
+        return StratifiedClusteringTableParser::parseTableLine;
+    }
+
+    protected static StratumParameters parseTableLine(final DataLine dataLine) { // one table row -> StratumParameters
+        final String name = dataLine.get(NAME_COLUMN);
+        final double reciprocalOverlap = dataLine.getDouble(RECIPROCAL_OVERLAP_COLUMN);
+        final double sizeSimilarity = dataLine.getDouble(SIZE_SIMILARITY_COLUMN);
+        final double sampleOverlap = dataLine.getDouble(SAMPLE_OVERLAP_COLUMN);
+        final int breakendWindow = dataLine.getInt(BREAKEND_WINDOW_COLUMN);
+        return new StratumParameters(name, reciprocalOverlap, sizeSimilarity, breakendWindow, sampleOverlap);
+    }
+
+    public record StratumParameters(String name, double reciprocalOverlap, double sizeSimilarity,
+                                    int breakendWindow, double sampleOverlap) {
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStatificationEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStatificationEngine.java
new file mode 100644
index 00000000000..9083aa83886
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStatificationEngine.java
@@ -0,0 +1,349 @@
+package org.broadinstitute.hellbender.tools.sv.stratify;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Lists;
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.OverlapDetector;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.utils.IntervalMergingRule;
+import org.broadinstitute.hellbender.utils.IntervalUtils;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+import org.broadinstitute.hellbender.utils.tsv.TableReader;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+// Groups variants into named strata by SVTYPE, SVLEN, and overlap with one or more interval sets ("tracks")
+public class SVStatificationEngine {
+
+    // Configuration table column names
+    public static final String NAME_COLUMN = "NAME";
+    public static final String SVTYPE_COLUMN = "SVTYPE";
+    public static final String MIN_SIZE_COLUMN = "MIN_SIZE";
+    public static final String MAX_SIZE_COLUMN = "MAX_SIZE";
+    public static final String TRACK_COLUMN = "TRACKS";
+    protected static final Set COLUMN_NAMES = ImmutableSet.of(NAME_COLUMN, SVTYPE_COLUMN, MIN_SIZE_COLUMN, MAX_SIZE_COLUMN, TRACK_COLUMN);
+    public static final String TRACK_COLUMN_DELIMITER = ",";
+
+    public static final Set NULL_TABLE_VALUES = Set.of("-1", "", "NULL", "NA"); // table values treated as "not set"
+
+    protected final Map> trackMap; // track name -> intervals
+    protected final Map strata; // stratum name -> stratum
+    protected final SAMSequenceDictionary dictionary;
+
+    public SVStatificationEngine(final SAMSequenceDictionary dictionary) {
+        trackMap = new HashMap<>();
+        strata = new HashMap<>();
+        this.dictionary = Utils.nonNull(dictionary);
+    }
+
+    public void addTrack(final String name, final List intervals) { // track names must be unique
+        Utils.nonNull(name);
+        Utils.nonNull(intervals);
+        Utils.validateArg(!trackMap.containsKey(name), "Track with name " + name + " already exists");
+
trackMap.put(name, OverlapDetector.create(intervals));
+    }
+
+    /**
+     * Adds a new stratification group
+     * @param name a unique ID
+     * @param svType SV type, may be null
+     * @param minSize minimum size in bp (inclusive), may be null for no lower bound
+     * @param maxSize maximum size in bp (exclusive), may be null for no upper bound
+     * @param trackNames reference track names
+     */
+    public void addStratification(final String name, final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+                                  final Integer minSize, final Integer maxSize, final Set trackNames) {
+        addStratification(new Stratum(name, svType, minSize, maxSize, trackNames));
+    }
+
+    protected void addStratification(final Stratum stratification) { // rejects duplicate stratum names
+        if (strata.containsKey(stratification.getName())) {
+            throw new GATKException("Encountered duplicate name " + stratification.getName());
+        }
+        strata.put(stratification.getName(), stratification);
+    }
+
+    /**
+     * Retrieves intervals for the given track
+     * @param name track ID
+     * @return searchable interval set, or null if no track with that name has been added
+     */
+    public OverlapDetector getTrackIntervals(final String name) {
+        return trackMap.get(name);
+    }
+
+    /**
+     * Factory method for creating a new engine from a config file and set of reference tracks. The config file
+     * is a table parsable by {@link TableReader}, with mandatory columns defined in {@link #COLUMN_NAMES}.
+     * @param trackMap map from reference track name to interval set, must not be null
+     * @param configFilePath path to stratification config table, must not be null
+     * @param dictionary reference dict, must not be null
+     * @return new engine holding the given tracks and every stratum read from the config table
+     */
+    public static SVStatificationEngine create(final Map> trackMap,
+                                               final GATKPath configFilePath,
+                                               final SAMSequenceDictionary dictionary) {
+        Utils.nonNull(trackMap);
+        Utils.nonNull(configFilePath);
+        final SVStatificationEngine engine = new SVStatificationEngine(dictionary);
+        for (final Map.Entry> entry : trackMap.entrySet()) {
+            engine.addTrack(entry.getKey(), entry.getValue());
+        }
+        try (final TableReader tableReader = TableUtils.reader(configFilePath.toPath(), engine::tableParser)) {
+            for (final Stratum stratification : tableReader) {
+                engine.addStratification(stratification);
+            }
+        } catch (final IOException e) {
+            throw new GATKException("IO error while reading config table", e);
+        }
+        return engine;
+    }
+
+    /**
+     * Get all stratification groups matching a given query record.
+     * @param record query record
+     * @param overlapFraction minimum overlap fraction (0 to 1)
+     * @param numBreakpointOverlaps minimum number of breakpoint ends that must lie in the reference track(s) (0, 1, 2)
+     * @param numBreakpointOverlapsInterchrom minimum breakpoint ends for interchromosomal variants (1, 2)
+     * @return all matching strata (empty if none match)
+     */
+    public Collection getMatches(final SVCallRecord record, final double overlapFraction, final int numBreakpointOverlaps, final int numBreakpointOverlapsInterchrom) {
+        Utils.nonNull(record);
+        final List result = new ArrayList<>();
+        for (final Stratum stratification : strata.values()) {
+            if (stratification.matches(record, overlapFraction, numBreakpointOverlaps, numBreakpointOverlapsInterchrom)) {
+                result.add(stratification);
+            }
+        }
+        return result;
+    }
+
+    protected Function tableParser(TableColumnCollection columns, Function exceptionFactory) {
+        // Check for expected columns
+        for (final String column : COLUMN_NAMES) {
+            if
(!columns.contains(column)) {
+                throw exceptionFactory.apply("Missing column " + column);
+            }
+        }
+        // Check there are no extra columns
+        if (columns.columnCount() != COLUMN_NAMES.size()) {
+            throw exceptionFactory.apply("Expected " + COLUMN_NAMES.size() + " columns but found " + columns.columnCount());
+        }
+        return this::parseTableLine;
+    }
+
+    protected Stratum parseTableLine(final DataLine dataLine) { // one config-table row -> Stratum
+        final GATKSVVCFConstants.StructuralVariantAnnotationType svType = GATKSVVCFConstants.StructuralVariantAnnotationType.valueOf(dataLine.get(SVTYPE_COLUMN));
+        final String name = dataLine.get(NAME_COLUMN);
+        final Integer minSize = parseIntegerMaybeNull(dataLine.get(MIN_SIZE_COLUMN));
+        final Integer maxSize = parseIntegerMaybeNull(dataLine.get(MAX_SIZE_COLUMN));
+        final Set trackNames = parseTrackString(dataLine.get(TRACK_COLUMN));
+        return new Stratum(name, svType, minSize, maxSize, trackNames);
+    }
+
+    protected Set parseTrackString(final String val) { // null-like value -> no tracks; else delimited registered names
+        if (NULL_TABLE_VALUES.contains(val)) {
+            return Collections.emptySet();
+        } else {
+            final String[] trackArray = val.split(TRACK_COLUMN_DELIMITER);
+            for (final String track : trackArray) {
+                if (!trackMap.containsKey(track)) {
+                    throw new GATKException("Could not find track with name " + track);
+                }
+            }
+            return Lists.newArrayList(trackArray).stream().collect(Collectors.toUnmodifiableSet());
+        }
+    }
+
+    protected Integer parseIntegerMaybeNull(final String val) { // null-like table value -> null
+        if (NULL_TABLE_VALUES.contains(val)) {
+            return null;
+        } else {
+            return Integer.valueOf(val);
+        }
+    }
+
+    public Collection getStrata() {
+        return strata.values();
+    }
+
+    public class Stratum {
+
+        final GATKSVVCFConstants.StructuralVariantAnnotationType svType;
+        final int minSize; // inclusive
+        final int maxSize; // exclusive
+        final List trackNames;
+        final String name;
+
+        Stratum(final String name, final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+                final Integer minSize, final Integer maxSize, final Set trackNames) {
+
this.name = Utils.nonNull(name);
+            for (final String trackName : trackNames) {
+                if (trackName != null && !trackMap.containsKey(trackName)) {
+                    throw new IllegalArgumentException("Unregistered track name " + trackName);
+                }
+            }
+            if (maxSize != null && minSize != null && maxSize <= minSize) {
+                throw new IllegalArgumentException("Min size must be strictly less than max size");
+            }
+            if (maxSize != null && maxSize < 0) {
+                throw new IllegalArgumentException("Max size cannot be less than 0");
+            }
+            if (maxSize != null && maxSize == Integer.MAX_VALUE) {
+                throw new IllegalArgumentException("Max size " + Integer.MAX_VALUE + " is reserved");
+            }
+            if (minSize != null && minSize < 0) {
+                throw new IllegalArgumentException("Min size cannot be less than 0");
+            }
+            if ((svType == GATKSVVCFConstants.StructuralVariantAnnotationType.BND || svType == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) && (minSize != null || maxSize != null)) {
+                throw new IllegalArgumentException("BND/CTX categories cannot have min or max size (" + name + ")");
+            }
+            this.svType = svType;
+            // Map a null (unbounded) min to Integer.MIN_VALUE
+            if (minSize == null) {
+                this.minSize = Integer.MIN_VALUE;
+            } else {
+                this.minSize = minSize;
+            }
+            // Map a null (unbounded) max to Integer.MAX_VALUE, which is why that value is reserved above
+            if (maxSize == null) {
+                this.maxSize = Integer.MAX_VALUE;
+            } else {
+                this.maxSize = maxSize;
+            }
+            this.trackNames = trackNames.stream().sorted().collect(Collectors.toList());
+        }
+
+        protected boolean matches(final SVCallRecord record, final double overlapFraction,
+                                  final int numBreakpointOverlaps, final int numBreakpointOverlapsInterchrom) {
+            return matchesType(record) && matchesSize(record) && matchesTracks(record, overlapFraction, numBreakpointOverlaps, numBreakpointOverlapsInterchrom);
+        }
+
+        protected boolean matchesType(final SVCallRecord record) {
+            return record.getType() == svType;
+        }
+
+        protected boolean matchesSize(final SVCallRecord record) {
+            final Integer length =
record.getLength();
+            if (length == null) {
+                // Undefined length requires null min/max boundaries
+                return minSize == Integer.MIN_VALUE && maxSize == Integer.MAX_VALUE;
+            } else {
+                return length >= minSize && length < maxSize;
+            }
+        }
+
+        /**
+         * Determines whether a given query record matches the reference tracks of this stratum.
+         * @param record query record
+         * @param overlapFraction minimum variant interval overlap fraction
+         * @param numBreakpointOverlaps minimum number of breakpoint ends that must lie in the track
+         * @param numBreakpointOverlapsInterchrom minimum breakpoint ends if the variant is interchromosomal
+         * @return true if the SV matches the tracks of this stratum
+         */
+        public boolean matchesTracks(final SVCallRecord record,
+                                     final double overlapFraction,
+                                     final int numBreakpointOverlaps,
+                                     final int numBreakpointOverlapsInterchrom) {
+            Utils.nonNull(record);
+            Utils.validate(overlapFraction >= 0 && overlapFraction <= 1,
+                    "Overlap fraction threshold " + overlapFraction + " must be on [0, 1]");
+            Utils.validate(numBreakpointOverlaps >= 0 && numBreakpointOverlaps <= 2,
+                    "Breakpoint overlaps threshold " + numBreakpointOverlaps + " must be 0, 1, or 2");
+            Utils.validate(numBreakpointOverlapsInterchrom == 1 || numBreakpointOverlapsInterchrom == 2,
+                    "Interchromosomal breakpoint overlaps threshold " + numBreakpointOverlapsInterchrom + " must be 1 or 2");
+            Utils.validate(!(overlapFraction == 0 && numBreakpointOverlaps == 0),
+                    "Overlap fraction and overlapping breakpoints thresholds cannot both be 0");
+            if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) {
+                // Just require the insertion locus to fall in an interval
+                return matchesTrackBreakpointOverlap(record, 1);
+            } else if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND || record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) {
+                // BND/CTX are handled as interchromosomal: only breakend overlap is checked
+                return matchesTrackBreakpointOverlap(record,
numBreakpointOverlapsInterchrom);
+            } else {
+                return matchesTrackIntrachromosomal(record, overlapFraction, numBreakpointOverlaps);
+            }
+        }
+
+        protected boolean matchesTrackIntrachromosomal(final SVCallRecord record,
+                                                       final double overlapFraction,
+                                                       final int numBreakpointOverlaps) {
+            return matchesTrackOverlapFraction(record, overlapFraction) && matchesTrackBreakpointOverlap(record, numBreakpointOverlaps);
+        }
+
+        protected boolean matchesTrackOverlapFraction(final SVCallRecord record, final double overlapFraction) {
+            if (overlapFraction > 0 && !trackNames.isEmpty()) { // a zero threshold or a stratum without tracks always matches
+                if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
+                    throw new GATKException("Track overlap for CPX types not currently supported (" + name + ")");
+                }
+                final SimpleInterval interval = new SimpleInterval(record.getContigA(), record.getPositionA(), record.getPositionB());
+                final List overlaps = new ArrayList<>();
+                for (final String track : trackNames) { // pool overlapping intervals from all of this stratum's tracks
+                    overlaps.addAll(trackMap.get(track).getOverlaps(interval).stream().map(SimpleInterval::new).collect(Collectors.toList()));
+                }
+                final List mergedOverlaps = IntervalUtils.sortAndMergeIntervals(overlaps, dictionary, IntervalMergingRule.ALL) // NOTE(review): presumably merged so shared bases are counted once - confirm
+                        .values().stream().flatMap(List::stream).collect(Collectors.toList());
+                long overlapLength = 0;
+                for (final Locatable overlap : mergedOverlaps) {
+                    overlapLength += interval.intersect(overlap).size();
+                }
+                return overlapLength / (double) interval.getLengthOnReference() >= overlapFraction; // covered fraction vs. threshold
+            } else {
+                return true;
+            }
+        }
+
+        protected boolean matchesTrackBreakpointOverlap(final SVCallRecord record, final int numBreakpointOverlaps) {
+            if (numBreakpointOverlaps > 0 && !trackNames.isEmpty()) { // a zero threshold or a stratum without tracks always matches
+                if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
+                    throw new GATKException("Track overlap for CPX types not currently supported (" + name + ")");
+                }
+                final SimpleInterval intervalA = new SimpleInterval(record.getContigA(), record.getPositionA(),
record.getPositionA());
+                final SimpleInterval intervalB = new SimpleInterval(record.getContigB(), record.getPositionB(), record.getPositionB());
+                return countAnyTrackOverlap(intervalA) + countAnyTrackOverlap(intervalB) >= numBreakpointOverlaps;
+            } else {
+                return true;
+            }
+        }
+
+        protected int countAnyTrackOverlap(final SimpleInterval interval) { // 1 if any of this stratum's tracks overlaps, else 0
+            for (final String track : trackNames) {
+                if (trackMap.get(track).overlapsAny(interval)) {
+                    return 1;
+                }
+            }
+            return 0;
+        }
+
+        public GATKSVVCFConstants.StructuralVariantAnnotationType getSvType() {
+            return svType;
+        }
+
+        public Integer getMinSize() { // Integer.MIN_VALUE when the stratum was built with a null min size
+            return minSize;
+        }
+
+        public Integer getMaxSize() { // Integer.MAX_VALUE when the stratum was built with a null max size
+            return maxSize;
+        }
+
+        public List getTrackNames() { // sorted at construction
+            return trackNames;
+        }
+
+        public String getName() {
+            return name;
+        }
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineArgumentsCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineArgumentsCollection.java
new file mode 100644
index 00000000000..2fe8ebaabe4
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineArgumentsCollection.java
@@ -0,0 +1,70 @@
+package org.broadinstitute.hellbender.tools.sv.stratify;
+
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Arguments for use with {@link SVStatificationEngine}.
+ */
+public class SVStratificationEngineArgumentsCollection implements Serializable {
+    // Command-line argument names
+    public static final String STRATIFY_CONFIG_FILE_LONG_NAME = "stratify-config";
+    public static final String TRACK_NAME_FILE_LONG_NAME = "track-name";
+    public static final String TRACK_INTERVAL_FILE_LONG_NAME = "track-intervals";
+    public static final String OVERLAP_FRACTION_LONG_NAME = "stratify-overlap-fraction";
+    public static final String NUM_BREAKPOINT_OVERLAPS_LONG_NAME = "stratify-num-breakpoint-overlaps";
+    public static final String NUM_BREAKPOINT_INTERCHROM_OVERLAPS_LONG_NAME = "stratify-num-breakpoint-overlaps-interchromosomal";
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Expected format is tab-delimited and contains columns NAME, SVTYPE, MIN_SIZE, MAX_SIZE, TRACKS. First line must
+     * be a header with column names. Comment lines starting with {@link TableUtils#COMMENT_PREFIX} are ignored.
+     */
+    @Argument(
+            doc = "Stratification configuration file (.tsv)",
+            fullName = STRATIFY_CONFIG_FILE_LONG_NAME
+    )
+    public GATKPath configFile;
+
+    @Argument(
+            doc = "Track intervals file. Can be specified multiple times.",
+            fullName = TRACK_INTERVAL_FILE_LONG_NAME,
+            optional = true
+    )
+    public List trackFileList;
+
+    @Argument(
+            doc = "Track names. Must be once for each --" + TRACK_INTERVAL_FILE_LONG_NAME,
+            fullName = TRACK_NAME_FILE_LONG_NAME,
+            optional = true
+    )
+    public List trackNameList;
+
+    @Argument(
+            doc = "Minimum overlap fraction for tracks",
+            minValue = 0,
+            maxValue = 1,
+            fullName = OVERLAP_FRACTION_LONG_NAME
+    )
+    public double overlapFraction = 0;
+
+    @Argument(
+            doc = "Minimum number of variant endpoint overlaps for tracks",
+            minValue = 0,
+            maxValue = 2,
+            fullName = NUM_BREAKPOINT_OVERLAPS_LONG_NAME
+    )
+    public int numBreakpointOverlaps = 1;
+
+    @Argument(
+            doc = "Minimum number of breakpoint overlaps for tracks for interchromosomal variants (e.g.
BNDs)",
+            minValue = 1,
+            maxValue = 2,
+            fullName = NUM_BREAKPOINT_INTERCHROM_OVERLAPS_LONG_NAME
+    )
+    public int numBreakpointOverlapsInterchrom = 1; // default: at least one breakend must overlap a track
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster.java
new file mode 100644
index 00000000000..46b8569c379
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster.java
@@ -0,0 +1,243 @@
+package org.broadinstitute.hellbender.tools.walkers.sv;
+
+import htsjdk.variant.vcf.VCFHeader;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.tools.sv.cluster.*;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.tsv.TableReader;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ *

Clusters structural variants using the same base algorithms as {@link SVCluster}. In addition, variants are + * grouped according to customizable stratification criteria including: + *

    + *
  • SV type
  • + *
  • Size range
  • + *
  • Reference track overlap
  • + *
+ * The first step is to define these groups in a stratification configuration TSV file. Please see the + * {@link SVStratify} tool for a description of the stratification method and expected table format. + * + *

Each SV is only clustered with other SVs in its own group. Each group must be mutually exclusive, meaning that + * any given SV should only belong to one group. Furthermore, SVs that do not fall into any of the groups will not be + * clustered.

+ * + *

The second step is to define the clustering configuration for each group. This is again done by creating a TSV + * file with the following columns defined on the first line: + *

    + *
  1. NAME
  2. + *
  3. RECIPROCAL_OVERLAP
  4. + *
  5. SIZE_SIMILARITY
  6. + *
  7. BREAKEND_WINDOW
  8. + *
  9. SAMPLE_OVERLAP
  10. + *
+ * where NAME corresponds to the same name given in the stratification configuration. Every group needs to be given + * a configuration here. That is, there should be a 1:1 correspondence of the rows in the two configuration files + * (order does not matter). + *

+ * + *

The remaining columns define the clustering parameters for the group. See {@link SVCluster} for more information + * on the different parameters. Note that, unlike {@link SVCluster}, distinct parameter sets for depth-only, + * PESR, and "mixed" clustering cannot be defined for this tool. Instead, the same parameters are applied to + * all three cases.

+ * + *

For example,

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
NAME | RECIPROCAL_OVERLAP | SIZE_SIMILARITY | BREAKEND_WINDOW | SAMPLE_OVERLAP
DEL_large_SD | 0.3 | 0.5 | 1000000 | 0.1
DUP_large_SD | 0.3 | 0.5 | 1000000 | 0.1
DEL_small_SR_RM | 0.5 | 0.5 | 100 | 0.1
DUP_small_SR_RM | 0.5 | 0.5 | 100 | 0.1
INS_SR | 0.5 | 0.5 | 100 | 0
+ * + *

This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.

+ * + *

This tool does not support CNV defragmentation via the {@link #algorithm} parameter.

+ * + *

Inputs

+ * + *
    + *
  • + * One or more SV VCFs + *
  • + *
  • + * Stratification configuration TSV file + *
  • + *
  • + * Clustering configuration TSV file + *
  • + *
  • + * Reference fasta + *
  • + *
+ * + *

Output

+ * + *
    + *
  • + * Clustered VCF + *
  • + *
+ * + *

Usage example

+ * + *
+ *     gatk GroupedSVCluster \
+ *       -R reference.fasta \
+ *       -V variants.vcf.gz \
+ *       -O clustered.vcf.gz \
+ *       --track-name repeatmasker \
+ *       --track-intervals repeatmasker.bed \
+ *       --stratify-config strata.tsv \
+ *       --clustering-config cluster.tsv
+ * 
+ * + * @author Mark Walker <markw@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Clusters structural variants within independent stratification groups", + oneLineSummary = "Clusters structural variants grouping by type, size, and track overlap", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public final class GroupedSVCluster extends SVClusterWalker { + public static final String CLUSTERING_CONFIG_FILE_LONG_NAME = "clustering-config"; + + @ArgumentCollection + private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection(); + + /** + * Expected format is tab-delimited and contains columns NAME, RECIPROCAL_OVERLAP, SIZE_SIMILARITY, BREAKEND_WINDOW, + * SAMPLE_OVERLAP. First line must be a header with column names. Comment lines starting with + * {@link TableUtils#COMMENT_PREFIX} are ignored. + */ + @Argument( + doc = "Configuration file (.tsv) containing the clustering parameters for each group", + fullName = CLUSTERING_CONFIG_FILE_LONG_NAME + ) + public GATKPath strataClusteringConfigFile; + + private SVStatificationEngine stratEngine; + private final Map clusterEngineMap = new HashMap<>(); + + @Override + public void onTraversalStart() { + super.onTraversalStart(); + // sorting not guaranteed + createOutputVariantIndex = false; + stratEngine = SVStratify.loadStratificationConfig(stratArgs, dictionary); + Utils.validate(!stratEngine.getStrata().isEmpty(), + "No strata defined with --" + SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME); + readStrataClusteringConfig(); + Utils.validate(stratEngine.getStrata().size() == clusterEngineMap.size(), + "Stratification and clustering configurations have a different number of groups."); + for (final SVStatificationEngine.Stratum stratum : stratEngine.getStrata()) { + Utils.validate(clusterEngineMap.containsKey(stratum.getName()), + "Could not find group " + 
stratum.getName() + " in clustering configuration."); + } + } + + @Override + protected VCFHeader createHeader() { + final VCFHeader header = super.createHeader(); + SVStratify.addStratifyMetadata(header); + return header; + } + + private void readStrataClusteringConfig() { + try (final TableReader tableReader = TableUtils.reader(strataClusteringConfigFile.toPath(), StratifiedClusteringTableParser::tableParser)) { + for (final StratifiedClusteringTableParser.StratumParameters parameters : tableReader) { + // Identical parameters for each linkage type + final ClusteringParameters pesrParams = ClusteringParameters.createPesrParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap()); + final ClusteringParameters mixedParams = ClusteringParameters.createMixedParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap()); + final ClusteringParameters depthParams = ClusteringParameters.createDepthParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap()); + final SVClusterEngine clusterEngine = createClusteringEngine(pesrParams, mixedParams, depthParams); + clusterEngineMap.put(parameters.name(), clusterEngine); + } + } catch (final IOException e) { + throw new GATKException("IO error while reading config table", e); + } + } + + private SVClusterEngine createClusteringEngine(final ClusteringParameters pesrParams, final ClusteringParameters mixedParams, final ClusteringParameters depthParams) { + if (algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE || algorithm == CLUSTER_ALGORITHM.MAX_CLIQUE) { + final SVClusterEngine.CLUSTERING_TYPE type = algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE ? 
+ SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE : SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE; + return SVClusterEngineFactory.createCanonical(type, breakpointSummaryStrategy, + altAlleleSummaryStrategy, dictionary, reference, enableCnv, + depthParams, mixedParams, pesrParams); + } else { + throw new IllegalArgumentException("Unsupported algorithm: " + algorithm.name()); + } + } + + @Override + public Object onTraversalSuccess() { + for (final SVClusterEngine engine : clusterEngineMap.values()) { + engine.flush().stream().forEach(this::write); + } + return super.onTraversalSuccess(); + } + + @Override + public void closeTool() { + super.closeTool(); + } + + @Override + public void applyRecord(final SVCallRecord record) { + final Collection stratifications = stratEngine.getMatches(record, + stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom); + if (stratifications.size() > 1) { + // don't allow more than one match since it would proliferate variants + final String matchesString = String.join(", ", stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList())); + throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString + + ". Groups must be mutually exclusive. 
Please modify the group configurations and/or tracks so that " + + "no variant can match more than one group."); + } else if (stratifications.isEmpty()) { + // no match, don't cluster + record.getAttributes().put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, Collections.singletonList(record.getId())); + record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(SVStratify.DEFAULT_STRATUM)); + write(record); + } else { + // exactly one match + final SVStatificationEngine.Stratum stratum = stratifications.iterator().next(); + Utils.validate(clusterEngineMap.containsKey(stratum.getName()), "Group undefined: " + stratum.getName()); + record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(stratum.getName())); + clusterAndWrite(record, clusterEngineMap.get(stratum.getName())); + } + } + + private void clusterAndWrite(final SVCallRecord record, final SVClusterEngine clusterEngine) { + clusterEngine.addAndFlush(record).stream().forEach(this::write); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java index e447d2c1086..473d5a0f3f6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java @@ -105,6 +105,8 @@ public class JointGermlineCNVSegmentation extends MultiVariantWalkerGroupedOnSta private SampleDB sampleDB; private boolean isMultiSampleInput = false; private ReferenceSequenceFile reference; + private Collection defragmentBuffer; + private Collection outputBuffer; private final Set allosomalContigs = new LinkedHashSet<>(Arrays.asList("X","Y","chrX","chrY")); class CopyNumberAndEndRecord { @@ -132,6 +134,7 @@ public int getEndPosition() { public static final String MODEL_CALL_INTERVALS_LONG_NAME = 
"model-call-intervals"; public static final String BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME = "breakpoint-summary-strategy"; public static final String ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME = "alt-allele-summary-strategy"; + public static final String FLAG_FIELD_LOGIC_LONG_NAME = "flag-field-logic"; @Argument(fullName = MIN_QUALITY_LONG_NAME, doc = "Minimum QS score to combine a variant segment", optional = true) private int minQS = 20; @@ -200,6 +203,13 @@ public boolean requiresReference() { // Cannot require sample overlap when clustering across samples private static final double CLUSTER_SAMPLE_OVERLAP_FRACTION = 0; + @Argument(fullName = SVClusterWalker.MAX_RECORDS_IN_RAM_LONG_NAME, + doc = "When writing VCF files that need to be sorted, this will specify the number of records stored in " + + "RAM before spilling to disk. Increasing this number reduces the number of file handles needed to sort a " + + "VCF file, and increases the amount of RAM needed.", + optional=true) + public int maxRecordsInRam = 10000; + @Override public void onTraversalStart() { reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier()); @@ -223,6 +233,8 @@ public void onTraversalStart() { clusterEngine = SVClusterEngineFactory.createCanonical(SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE, breakpointSummaryStrategy, altAlleleSummaryStrategy, dictionary, reference, true, clusterArgs, CanonicalSVLinkage.DEFAULT_MIXED_PARAMS, CanonicalSVLinkage.DEFAULT_PESR_PARAMS); + defragmentBuffer = new ArrayList<>(); + outputBuffer = new ArrayList<>(); vcfWriter = getVCFWriter(); if (getSamplesForVariants().size() != 1) { @@ -285,14 +297,38 @@ public void apply(final List variantContexts, final ReferenceCon final SVCallRecord record = createDepthOnlyFromGCNVWithOriginalGenotypes(vc, minQS, allosomalContigs, refAutosomalCopyNumber, sampleDB); if (record != null) { if (!isMultiSampleInput) { - defragmenter.add(record); + 
bufferDefragmenterOutput(defragmenter.addAndFlush(record)); } else { - clusterEngine.add(record); + bufferClusterOutput(clusterEngine.addAndFlush(record)); } } } } + private void bufferDefragmenterOutput(final List records) { + defragmentBuffer.addAll(records); + } + + private List flushDefragmenterBuffer() { + final List result = defragmentBuffer.stream() + .sorted(Comparator.comparingInt(SVCallRecord::getPositionA)) + .collect(Collectors.toUnmodifiableList()); + defragmentBuffer = new ArrayList<>(); + return result; + } + + private void bufferClusterOutput(final List records) { + outputBuffer.addAll(records); + } + + private List flushClusterBuffer() { + final List result = outputBuffer.stream() + .sorted(Comparator.comparingInt(SVCallRecord::getPositionA)) + .collect(Collectors.toUnmodifiableList()); + outputBuffer = new ArrayList<>(); + return result; + } + @Override public Object onTraversalSuccess() { processClusters(); @@ -305,11 +341,16 @@ public Object onTraversalSuccess() { * new contig. 
*/ private void processClusters() { - final List defragmentedCalls = defragmenter.forceFlush(); - defragmentedCalls.stream().forEachOrdered(clusterEngine::add); + bufferDefragmenterOutput(defragmenter.flush()); //Jack and Isaac cluster first and then defragment - final List clusteredCalls = clusterEngine.forceFlush(); - write(clusteredCalls); + bufferClusterOutput( + flushDefragmenterBuffer().stream() + .map(clusterEngine::addAndFlush) + .flatMap(List::stream) + .collect(Collectors.toList()) + ); + bufferClusterOutput(clusterEngine.flush()); + write(flushClusterBuffer()); } private VariantContext buildAndSanitizeRecord(final SVCallRecord record) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java index 791befb79fb..cd280857a63 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java @@ -1,32 +1,13 @@ package org.broadinstitute.hellbender.tools.walkers.sv; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.reference.ReferenceSequenceFile; -import htsjdk.variant.variantcontext.GenotypesContext; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.vcf.*; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.ArgumentCollection; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; -import org.broadinstitute.hellbender.engine.*; 
-import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; -import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines; import org.broadinstitute.hellbender.tools.sv.SVCallRecord; -import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils; import org.broadinstitute.hellbender.tools.sv.cluster.*; -import org.broadinstitute.hellbender.utils.reference.ReferenceUtils; - -import java.util.List; -import java.util.Set; - -import static org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation.BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME; /** *

Clusters structural variants based on coordinates, event type, and supporting algorithms. Primary use cases include:

@@ -178,111 +159,10 @@ ) @BetaFeature @DocumentedFeature -public final class SVCluster extends MultiVariantWalker { - public static final String PLOIDY_TABLE_LONG_NAME = "ploidy-table"; - public static final String VARIANT_PREFIX_LONG_NAME = "variant-prefix"; - public static final String ENABLE_CNV_LONG_NAME = "enable-cnv"; +public final class SVCluster extends SVClusterWalker { + public static final String DEFRAG_PADDING_FRACTION_LONG_NAME = "defrag-padding-fraction"; public static final String DEFRAG_SAMPLE_OVERLAP_LONG_NAME = "defrag-sample-overlap"; - public static final String CONVERT_INV_LONG_NAME = "convert-inv-to-bnd"; - public static final String ALGORITHM_LONG_NAME = "algorithm"; - public static final String FAST_MODE_LONG_NAME = "fast-mode"; - public static final String OMIT_MEMBERS_LONG_NAME = "omit-members"; - public static final String DEFAULT_NO_CALL_LONG_NAME = "default-no-call"; - - /** - * The enum Cluster algorithm. - */ - enum CLUSTER_ALGORITHM { - /** - * Defragment cnv cluster algorithm. - */ - DEFRAGMENT_CNV, - /** - * Single linkage cluster algorithm. - */ - SINGLE_LINKAGE, - /** - * Max clique cluster algorithm. - */ - MAX_CLIQUE - } - - @Argument( - doc = "Output VCF", - fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, - shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME - ) - private GATKPath outputFile; - - /** - * Expected format is tab-delimited and contains a header with the first column SAMPLE and remaining columns - * contig names. Each row corresponds to a sample, with the sample ID in the first column and contig ploidy - * integers in their respective columns. 
- */ - @Argument( - doc = "Sample ploidy table (.tsv)", - fullName = PLOIDY_TABLE_LONG_NAME - ) - private GATKPath ploidyTablePath; - - @Argument( - doc = "If supplied, generate variant IDs with this prefix", - fullName = VARIANT_PREFIX_LONG_NAME, - optional = true - ) - private String variantPrefix = null; - - /** - * When enabled, DEL and DUP variants will be clustered together. The resulting records with have an SVTYPE of CNV. - */ - @Argument( - doc = "Enable clustering DEL/DUP variants together as CNVs (does not apply to CNV defragmentation)", - fullName = ENABLE_CNV_LONG_NAME, - optional = true - ) - private boolean enableCnv = false; - - /** - * When enabled, INV records will be converted to a pairs of BNDs prior to clustering. - */ - @Argument( - doc = "Convert inversions to BND records", - fullName = CONVERT_INV_LONG_NAME, - optional = true - ) - private boolean convertInversions = false; - - /** - * Results in substantial space and time costs for large sample sets by clearing genotypes that are not needed for - * clustering, but any associated annotation fields will be set to null in the output. - */ - @Argument( - doc = "Fast mode. 
Drops hom-ref and no-call genotype fields and emits them as no-calls.", - fullName = FAST_MODE_LONG_NAME, - optional = true - ) - private boolean fastMode = false; - - @Argument( - doc = "Omit cluster member ID annotations", - fullName = OMIT_MEMBERS_LONG_NAME, - optional = true - ) - private boolean omitMembers = false; - - @Argument(fullName = BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME, - doc = "Strategy to use for choosing a representative value for a breakpoint cluster.", - optional = true) - private CanonicalSVCollapser.BreakpointSummaryStrategy breakpointSummaryStrategy = - CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE; - - @Argument(fullName = JointGermlineCNVSegmentation.ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME, - doc = "Strategy to use for choosing a representative alt allele for non-CNV biallelic sites with " + - "different subtypes.", - optional = true) - private CanonicalSVCollapser.AltAlleleSummaryStrategy altAlleleSummaryStrategy = - CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE; @Argument(fullName = DEFRAG_PADDING_FRACTION_LONG_NAME, doc = "Padding as a fraction of variant length for CNV defragmentation mode.", @@ -296,51 +176,14 @@ enum CLUSTER_ALGORITHM { ) private double defragSampleOverlapFraction = CNVLinkage.DEFAULT_SAMPLE_OVERLAP; - @Argument(fullName = ALGORITHM_LONG_NAME, - doc = "Clustering algorithm", - optional = true - ) - private CLUSTER_ALGORITHM algorithm = CLUSTER_ALGORITHM.SINGLE_LINKAGE; - - /** - * Default genotypes are assigned when they cannot be inferred from the inputs, such as when VCFs with different - * variants and samples are provided. - */ - @Argument(fullName = DEFAULT_NO_CALL_LONG_NAME, - doc = "Default to no-call GT (e.g. ./.) instead of reference alleles (e.g. 
0/0) when a genotype is not" + - " available", - optional = true - ) - private boolean defaultNoCall = false; - @ArgumentCollection private final SVClusterEngineArgumentsCollection clusterParameterArgs = new SVClusterEngineArgumentsCollection(); - private SAMSequenceDictionary dictionary; - private ReferenceSequenceFile reference; - private PloidyTable ploidyTable; - private VariantContextWriter writer; - private VCFHeader header; - private SVClusterEngine clusterEngine; - private Set samples; - private String currentContig; - private int numVariantsBuilt = 0; - - @Override - public boolean requiresReference() { - return true; - } + protected SVClusterEngine clusterEngine; @Override public void onTraversalStart() { - reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier()); - dictionary = reference.getSequenceDictionary(); - if (dictionary == null) { - throw new UserException("Reference sequence dictionary required"); - } - ploidyTable = new PloidyTable(ploidyTablePath.toPath()); - samples = getSamplesForVariants(); - + super.onTraversalStart(); if (algorithm == CLUSTER_ALGORITHM.DEFRAGMENT_CNV) { clusterEngine = SVClusterEngineFactory.createCNVDefragmenter(dictionary, altAlleleSummaryStrategy, reference, defragPaddingFraction, defragSampleOverlapFraction); @@ -354,107 +197,21 @@ public void onTraversalStart() { } else { throw new IllegalArgumentException("Unsupported algorithm: " + algorithm.name()); } - - writer = createVCFWriter(outputFile); - header = createHeader(); - writer.writeHeader(header); - currentContig = null; } @Override public Object onTraversalSuccess() { - write(true); + clusterEngine.flush().stream().forEach(this::write); return super.onTraversalSuccess(); } @Override public void closeTool() { super.closeTool(); - if (writer != null) { - writer.close(); - } } @Override - public void apply(final VariantContext variant, final ReadsContext readsContext, - final ReferenceContext referenceContext, final 
FeatureContext featureContext) { - final SVCallRecord call = SVCallRecordUtils.create(variant, dictionary); - final SVCallRecord filteredCall; - if (fastMode && call.getType() != GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) { - // Strip out non-carrier genotypes to save memory and compute - // Don't do for multi-allelic CNVs since carrier status can't be determined - final GenotypesContext filteredGenotypes = GenotypesContext.copy(call.getCarrierGenotypeList()); - filteredCall = SVCallRecordUtils.copyCallWithNewGenotypes(call, filteredGenotypes); - } else { - filteredCall = call; - } - - // Update current contig - if (!filteredCall.getContigA().equals(currentContig)) { - currentContig = filteredCall.getContigA(); - logger.info("Processing contig " + currentContig + "..."); - } - - // Add to clustering buffer - if (convertInversions) { - SVCallRecordUtils.convertInversionsToBreakends(filteredCall, dictionary).forEachOrdered(clusterEngine::add); - } else { - clusterEngine.add(filteredCall); - } - - write(false); - } - - private void write(final boolean force) { - final List records = force ? 
clusterEngine.forceFlush() : clusterEngine.flush(); - records.stream().map(this::buildVariantContext).forEachOrdered(writer::add); + public void applyRecord(final SVCallRecord record) { + clusterEngine.addAndFlush(record).stream().forEach(this::write); } - - private VCFHeader createHeader() { - final VCFHeader header = new VCFHeader(getHeaderForVariants().getMetaDataInInputOrder(), samples); - header.setSequenceDictionary(dictionary); - - // Required info lines - header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); - header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVLEN)); - header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVTYPE)); - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.END2_ATTRIBUTE, 1, - VCFHeaderLineType.Integer, "Second position")); - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, 1, - VCFHeaderLineType.String, "Second contig")); - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRANDS_ATTRIBUTE, 1, - VCFHeaderLineType.String, "First and second strands")); - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, - VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Source algorithms")); - if (!omitMembers) { - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, - VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Cluster variant ids")); - } - - // Required format lines - header.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY)); - - return header; - } - - public VariantContext buildVariantContext(final SVCallRecord call) { - // Add genotypes for missing samples - final GenotypesContext filledGenotypes = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles( - call, samples, !defaultNoCall, ploidyTable, header); - - // Assign new variant ID - final String newId = variantPrefix == null 
? call.getId() : String.format("%s%08x", variantPrefix, numVariantsBuilt++); - - // Build new variant - final SVCallRecord finalCall = new SVCallRecord(newId, call.getContigA(), call.getPositionA(), call.getStrandA(), - call.getContigB(), call.getPositionB(), call.getStrandB(), call.getType(), call.getComplexSubtype(), - call.getComplexEventIntervals(), call.getLength(), call.getAlgorithms(), call.getAlleles(), filledGenotypes, - call.getAttributes(), call.getFilters(), call.getLog10PError(), dictionary); - final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(finalCall); - if (omitMembers) { - builder.rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY); - } - return builder.make(); - } - } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java index 56c389539c6..18f405cef06 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java @@ -199,7 +199,7 @@ protected SVCallRecord minimizeTruthFootprint(final SVCallRecord item) { final List genotypes = item.getGenotypes().stream().map(SVConcordance::stripTruthGenotype).collect(Collectors.toList()); return new SVCallRecord(item.getId(), item.getContigA(), item.getPositionA(), item.getStrandA(), item.getContigB(), item.getPositionB(), item.getStrandB(), item.getType(), - item.getComplexSubtype(), item.getComplexEventIntervals(), item.getLength(), item.getAlgorithms(), + item.getComplexSubtype(), item.getComplexEventIntervals(), item.getLength(), item.getEvidence(), item.getAlgorithms(), item.getAlleles(), genotypes, item.getAttributes(), item.getFilters(), item.getLog10PError(), dictionary); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java new 
file mode 100644 index 00000000000..0dc560c90ca --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java @@ -0,0 +1,330 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.util.Locatable; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.ArgumentCollection; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.sv.SVCallRecord; +import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils; +import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine; +import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection; +import org.broadinstitute.hellbender.utils.*; + +import java.io.File; +import java.nio.file.Path; +import java.util.*; +import java.util.stream.Collectors; + +/** + *

Stratifies structural variants into mutually exclusive groups according to the following customizable criteria: + *

    + *
  • SV type
  • + *
  • Size range
  • + *
  • Reference track overlap
  • + *
+ * Records are annotated with their respective strata names in the {@link GATKSVVCFConstants#STRATUM_INFO_KEY} INFO + * field. Users must provide a stratification configuration .tsv file (tab-delimited table) with the following column + * header on the first line: + *
    + *
  1. NAME
  2. + *
  3. SVTYPE
  4. + *
  5. MIN_SIZE
  6. + *
  7. MAX_SIZE
  8. + *
  9. TRACKS
  10. + *
+ *

+ *

For example:

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
NAME | SVTYPE | MIN_SIZE | MAX_SIZE | TRACKS
DEL_large_SD | DEL | 5000 | -1 | SD
DUP_large_SD | DUP | 5000 | -1 | SD
DEL_small_SR_RM | DEL | -1 | 5000 | SR,RM
DUP_small_SR_RM | DUP | -1 | 5000 | SR,RM
INS_SR | INS | -1 | -1 | SR
+ *

+ * The "NAME" column is an arbitrary identifier, "SVTYPE" is the class of variant (DEL, DUP, INS, etc.), MIN_SIZE is an + * inclusive size lower-bound, MAX_SIZE is an exclusive size upper-bound, and TRACKS is a comma-delimited list of + * reference tracks defined using the {@link SVStratificationEngineArgumentsCollection#trackFileList} and + * {@link SVStratificationEngineArgumentsCollection#trackNameList} parameters. For example, + *

+ *
+ *     gatk GroupedSVCluster \
+ *       --track-name RM \
+ *       --track-intervals repeatmasker.bed \
+ *       --track-name SD \
+ *       --track-intervals segmental_duplications.bed \
+ *       --track-name SR \
+ *       --track-intervals simple_repeats.bed \
+ *       ...
+ * 
+ *

+ * The MIN_SIZE, MAX_SIZE, and TRACKS columns may contain null values {"-1", "", "NULL", "NA"}. Null MIN_SIZE and + * MAX_SIZE correspond to negative and positive infinity, respectively, and a null TRACKS value means that variants + * will not be matched based on track. Variants with undefined SVLEN will only match if both MIN_SIZE and MAX_SIZE + * are null. + *

+ * + *

The {@link SVStratificationEngineArgumentsCollection#overlapFraction}, + * {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlaps}, and + * {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom} parameters can be used to modify the overlap + * criteria for assigning variants to each group based on overlap with the given reference track intervals. By + * default, only one endpoint of the variant needs to lie in a track interval in order to match. INS variants are + * treated as single points and only {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlaps} is used, + * ignoring {@link SVStratificationEngineArgumentsCollection#overlapFraction}. Similarly, CTX and BND variant + * overlap is only defined by {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom}. + *

+ * + *

By default, each stratification group must be mutually exclusive, meaning that any given SV can only belong to + * one group. An error is thrown if the tool encounters a variant that meets the criteria for more than one group. + * This restriction can be overridden with the {@link SVStratify#ALLOW_MULTIPLE_MATCHES_LONG_NAME} argument, in which + * case the record will be written out multiple times: once for each matching stratification group with the corresponding + * {@link GATKSVVCFConstants#STRATUM_INFO_KEY} value. Furthermore, SVs that do not match any of the groups will be + * annotated with the {@link SVStratify#DEFAULT_STRATUM} group.

+ * + *

If using {@link #SPLIT_OUTPUT_LONG_NAME}, the tool generates one VCF per stratification group (including the + * {@link SVStratify#DEFAULT_STRATUM} group), with each VCF containing only the records assigned to that group.

+ * + *

This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.

+ * + *

Inputs

+ * + *
    + *
  • + * One or more SV VCFs + *
  • + *
  • + * Stratification configuration TSV file + *
  • + *
  • + * Reference dictionary + *
  • + *
+ * + *

Output

+ * + *
    + *
  • + * Annotated VCF(s) + *
  • + *
+ * + *

Usage example, generating stratified VCFs:

+ * + *
+ *     gatk SVStratify \
+ *       -V variants.vcf.gz \
+ *       --split-output \
+ *       -O ./ \
+ *       --output-prefix out \
+ *       --sequence-dictionary reference.dict \
+ *       --track-name RM \
+ *       --track-intervals repeatmasker.bed \
+ *       --stratify-config strata.tsv
+ * 
+ * + *

Usage example, a single annotated VCF:

+ * + *
+ *     gatk SVStratify \
+ *       -V variants.vcf.gz \
+ *       -O out.vcf.gz \
+ *       --sequence-dictionary reference.dict \
+ *       --track-name RM \
+ *       --track-intervals repeatmasker.bed \
+ *       --stratify-config strata.tsv
+ * 
+ * + * @author Mark Walker <markw@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Annotates variants by SV type, size, and reference tracks", + oneLineSummary = "Annotates variants by SV type, size, and reference tracks", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public final class SVStratify extends MultiVariantWalker { + + public static final String ALLOW_MULTIPLE_MATCHES_LONG_NAME = "allow-multiple-matches"; + public static final String SPLIT_OUTPUT_LONG_NAME = "split-output"; + + // Default output group name for unmatched records + public static final String DEFAULT_STRATUM = "default"; + + @Argument( + doc = "Output path. Must be a directory if using --" + SPLIT_OUTPUT_LONG_NAME, + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME + ) + private GATKPath outputPath; + + @Argument( + doc = "Prefix for output filenames, only if using --" + SPLIT_OUTPUT_LONG_NAME, + fullName = CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, + optional = true + ) + private String outputPrefix; + + @ArgumentCollection + private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection(); + + @Argument( + doc = "Do not enforce mutual exclusivity for each stratification group", + fullName = ALLOW_MULTIPLE_MATCHES_LONG_NAME + ) + private boolean allowMultipleMatches = false; + + @Argument( + doc = "Split output into multiple VCFs, one per stratification group. 
If used, then --" + + StandardArgumentDefinitions.OUTPUT_LONG_NAME + " must be the output directory and --" + + CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME + " must be provided.", + fullName = SPLIT_OUTPUT_LONG_NAME + ) + private boolean splitOutput = false; + + protected SAMSequenceDictionary dictionary; + protected Map writers; + protected SVStatificationEngine engine; + + @Override + public void onTraversalStart() { + super.onTraversalStart(); + dictionary = getMasterSequenceDictionary(); + Utils.validateArg(dictionary != null, "Reference dictionary is required; please specify with --" + + StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME); + engine = loadStratificationConfig(stratArgs, dictionary); + logger.debug("Loaded stratification groups:"); + for (final SVStatificationEngine.Stratum s : engine.getStrata()) { + logger.debug(s); + } + initializeWriters(); + } + + protected void createGroupWriter(final String name, final Path path) { + final VariantContextWriter writer = createVCFWriter(path); + final VCFHeader header = new VCFHeader(getHeaderForVariants()); + addStratifyMetadata(header); + writer.writeHeader(header); + if (writers.containsKey(name)) { + throw new GATKException.ShouldNeverReachHereException("Stratification name already exists: " + name); + } + writers.put(name, writer); + } + + public static void addStratifyMetadata(final VCFHeader header) { + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRATUM_INFO_KEY, 1, + VCFHeaderLineType.String, "Stratum ID")); + } + + protected Path generateGroupOutputPath(final String name) { + final String filename = outputPrefix + "." 
+ name + ".vcf.gz"; + return outputPath.toPath().resolve(filename); + } + + protected void initializeWriters() { + writers = new HashMap<>(); + if (splitOutput) { + Utils.validateArg(outputPrefix != null, "Argument --" + CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME + " required if using --" + SPLIT_OUTPUT_LONG_NAME); + Utils.validateArg(new File(outputPath.toString()).isDirectory(), "Argument --" + StandardArgumentDefinitions.OUTPUT_LONG_NAME + " must be a directory if using " + SPLIT_OUTPUT_LONG_NAME); + createGroupWriter(DEFAULT_STRATUM, generateGroupOutputPath(DEFAULT_STRATUM)); + for (final SVStatificationEngine.Stratum s : engine.getStrata()) { + createGroupWriter(s.getName(), generateGroupOutputPath(s.getName())); + } + } else { + createGroupWriter(DEFAULT_STRATUM, outputPath.toPath()); + } + } + + /** + * Reusable method for loading the stratification configuration table. See tool doc for the expected format. + */ + public static SVStatificationEngine loadStratificationConfig(final SVStratificationEngineArgumentsCollection args, + final SAMSequenceDictionary dictionary) { + Utils.validateArg(args.trackNameList.size() == args.trackFileList.size(), "Arguments --" + + SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME + " and --" + SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME + + " must be specified the same number of times."); + final Map> map = new HashMap<>(); + final Iterator nameIterator = args.trackNameList.iterator(); + final Iterator pathIterator = args.trackFileList.iterator(); + final GenomeLocParser genomeLocParser = new GenomeLocParser(dictionary); + while (nameIterator.hasNext() && pathIterator.hasNext()) { + final String name = nameIterator.next(); + final GATKPath path = pathIterator.next(); + final GenomeLocSortedSet genomeLocs = IntervalUtils.loadIntervals(Collections.singletonList(path.toString()), IntervalSetRule.UNION, IntervalMergingRule.ALL, 0, genomeLocParser); + final List intervals = 
Collections.unmodifiableList(genomeLocs.toList()); + if (map.containsKey(name)) { + throw new UserException.BadInput("Duplicate track name was specified: " + name); + } + map.put(name, intervals); + } + final SVStatificationEngine engine = SVStatificationEngine.create(map, args.configFile, dictionary); + if (engine.getStrata().stream().anyMatch(s -> s.getName().equals(DEFAULT_STRATUM))) { + throw new UserException.BadInput("Stratification configuration contains entry with reserved " + + "ID \"" + DEFAULT_STRATUM + "\""); + } + return engine; + } + + @Override + public void closeTool() { + for (final VariantContextWriter writer : writers.values()) { + writer.close(); + } + super.closeTool(); + } + + @Override + public void apply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext) { + // Save a ton of compute by not copying genotypes into the new record + final VariantContext variantNoGenotypes = new VariantContextBuilder(variant).genotypes(Collections.emptyList()).make(); + final SVCallRecord record = SVCallRecordUtils.create(variantNoGenotypes, dictionary); + final Collection stratifications = engine.getMatches(record, + stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom); + final VariantContextBuilder builder = new VariantContextBuilder(variant); + if (stratifications.isEmpty()) { + writers.get(DEFAULT_STRATUM).add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, DEFAULT_STRATUM).make()); + } else { + if (!allowMultipleMatches && stratifications.size() > 1) { + final String matchesString = String.join(", ", stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList())); + throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString + ". 
Bypass this error using the --" + ALLOW_MULTIPLE_MATCHES_LONG_NAME + " argument"); + } + for (final SVStatificationEngine.Stratum stratum : stratifications) { + final VariantContextWriter writer = splitOutput ? writers.get(stratum.getName()) : writers.get(DEFAULT_STRATUM); + if (writer == null) { + throw new GATKException("Writer not found for group: " + stratum.getName()); + } + writer.add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, stratum.getName()).make()); + } + } + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java index 35864d62f1a..f0c45c8f934 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java @@ -76,7 +76,7 @@ public Object[][] testCreateInvalidCoordinatesData() { @Test(dataProvider="testCreateInvalidCoordinatesData", expectedExceptions = { IllegalArgumentException.class }) public void testCreateInvalidCoordinates(final String contigA, final int posA, final String contigB, final int posB) { new SVCallRecord("var1", contigA, posA, true, contigB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, - null, Collections.emptyList(), null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), + null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); Assert.fail("Expected exception not thrown"); } @@ -93,14 +93,14 @@ public Object[][] testCreateValidCoordinatesData() { @Test(dataProvider="testCreateValidCoordinatesData") public void testCreateValidCoordinates(final String contigA, final int posA, final String contigB, final int posB) { new SVCallRecord("var1", contigA, posA, true, 
contigB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, - null, Collections.emptyList(), null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), + null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); } @Test public void testGetters() { final SVCallRecord record = new SVCallRecord("var1", "chr1", 100, true, "chr1", 200, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, - GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:100-200", SVTestUtils.hg38Dict)), null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), + GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:100-200", SVTestUtils.hg38Dict)), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), GenotypesContext.create(GenotypeBuilder.create("sample1", Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL))), Collections.singletonMap("TEST_KEY", "TEST_VALUE"), Collections.singleton("TEST_FILTER"), Double.valueOf(30), SVTestUtils.hg38Dict); Assert.assertEquals(record.getId(), "var1"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java index 2c814a45576..1ee6107f3bb 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java @@ -69,7 +69,7 @@ public Object[][] testGetVariantBuilderData() { return new Object[][]{ // DEL { - new SVCallRecord("var1", "chr1", 1000, true, 
"chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, + new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(), SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST, ALLELES_DEL, Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2), Collections.emptyMap(), Collections.singleton("TEST_FILTER"), Double.valueOf(-3)), @@ -86,7 +86,7 @@ public Object[][] testGetVariantBuilderData() { }, // DEL w/ null ref allele { - new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, + new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(), SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST, Collections.singletonList(Allele.SV_SIMPLE_DEL), Collections.singletonList(GENOTYPE_DEL_3), @@ -102,7 +102,7 @@ public Object[][] testGetVariantBuilderData() { }, // INS { - new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, + new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1), @@ -119,7 +119,7 @@ public Object[][] testGetVariantBuilderData() { }, // INS, flipped strands { - new SVCallRecord("var2", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, + new SVCallRecord("var2", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, 
Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1), @@ -136,7 +136,7 @@ public Object[][] testGetVariantBuilderData() { }, // INS, undefined length { - new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null, + new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1), @@ -153,7 +153,7 @@ public Object[][] testGetVariantBuilderData() { }, // BND { - new SVCallRecord("var_bnd", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, + new SVCallRecord("var_bnd", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_BND, Lists.newArrayList(GENOTYPE_BND_1), @@ -172,7 +172,7 @@ public Object[][] testGetVariantBuilderData() { }, // CTX { - new SVCallRecord("var_ctx", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null, + new SVCallRecord("var_ctx", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CTX, Lists.newArrayList(GENOTYPE_CTX_1), @@ -196,6 +196,7 @@ public Object[][] testGetVariantBuilderData() { GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL, Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:5000-5100", SVTestUtils.hg38Dict), SVCallRecord.ComplexEventInterval.decode("DEL_chr2:100-200", SVTestUtils.hg38Dict)), 100, 
+ Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Lists.newArrayList(GENOTYPE_CPX_1), @@ -223,7 +224,7 @@ public void testGetVariantBuilder(final SVCallRecord record, final VariantContex @Test public void testGetVariantBuilderHasSanitizedNullAttributes() { - final SVCallRecord record = new SVCallRecord("var3", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, + final SVCallRecord record = new SVCallRecord("var3", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_BND, Lists.newArrayList(GENOTYPE_BND_1), @@ -300,14 +301,14 @@ public void testFillMissingSamplesWithGenotypes() { @Test public void testCopyCallWithNewGenotypes() { - final SVCallRecord record = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, + final SVCallRecord record = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(), SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST, ALLELES_DEL, Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2), Collections.singletonMap(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, Collections.singletonList("sample")), Collections.emptySet(), null); final GenotypesContext genotypes = GenotypesContext.copy(Collections.singletonList(GENOTYPE_DEL_3)); final SVCallRecord result = SVCallRecordUtils.copyCallWithNewGenotypes(record, genotypes); - final SVCallRecord expected = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, + final SVCallRecord expected = new SVCallRecord("var1", "chr1", 1000, 
true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(), SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST, ALLELES_DEL, genotypes, @@ -449,7 +450,7 @@ public void testConvertInversionsToBreakends() { Assert.assertNotNull(nonInversionResult.get(0)); SVTestUtils.assertEqualsExceptMembership(nonInversionResult.get(0), nonInversion); - final SVCallRecord inversion = new SVCallRecord("", "chr1", 1000, true, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, Collections.emptyList(), 1000, + final SVCallRecord inversion = new SVCallRecord("", "chr1", 1000, true, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, Collections.emptyList(), 1000, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), @@ -524,7 +525,7 @@ public Object[][] testCreateData() { GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), null, null, TEST_ATTRIBUTES, -90.), - new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, + new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), ALLELES_DEL, Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2), TEST_ATTRIBUTES, Collections.emptySet(), -90.) 
}, @@ -534,7 +535,7 @@ public Object[][] testCreateData() { GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), null, null, TEST_ATTRIBUTES, null), - new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, + new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), ALLELES_DEL, Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -543,7 +544,7 @@ public Object[][] testCreateData() { ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), 500, "+-", GATKSVVCFConstants.StructuralVariantAnnotationType.INS, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, null, null, TEST_ATTRIBUTES, null), - new SVCallRecord("var3", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, + new SVCallRecord("var3", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -552,7 +553,7 @@ public Object[][] testCreateData() { ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), 500, "-+", GATKSVVCFConstants.StructuralVariantAnnotationType.INS, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, null, null, TEST_ATTRIBUTES, null), - new SVCallRecord("var4", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, + new SVCallRecord("var4", "chr1", 1000, false, "chr1", 1000, true, 
GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -561,7 +562,7 @@ public Object[][] testCreateData() { ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), -1, "-+", GATKSVVCFConstants.StructuralVariantAnnotationType.INS, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, null, null, TEST_ATTRIBUTES, null), - new SVCallRecord("var4b", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null, + new SVCallRecord("var4b", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -570,7 +571,7 @@ public Object[][] testCreateData() { ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1), null, "++", GATKSVVCFConstants.StructuralVariantAnnotationType.BND, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, "chrX", 2000, TEST_ATTRIBUTES, null), - new SVCallRecord("var5", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, + new SVCallRecord("var5", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -579,7 +580,7 @@ public Object[][] testCreateData() { ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1), null, "++", GATKSVVCFConstants.StructuralVariantAnnotationType.BND, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, 
"chrX", 2000, TEST_ATTRIBUTES, null), - new SVCallRecord("var6", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, + new SVCallRecord("var6", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -588,7 +589,7 @@ public Object[][] testCreateData() { ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), 250, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, "chrX", 2000, TEST_ATTRIBUTES_CPX, null), - new SVCallRecord("var7", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, + new SVCallRecord("var7", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -597,7 +598,7 @@ public Object[][] testCreateData() { ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), 250, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, "chr1", null, TEST_ATTRIBUTES_CPX, null), - new SVCallRecord("var8", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, + new SVCallRecord("var8", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, 
Collections.emptyList(), 250, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -606,7 +607,7 @@ public Object[][] testCreateData() { ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), 250, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, null, null, TEST_ATTRIBUTES_CPX, null), - new SVCallRecord("var9", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, + new SVCallRecord("var9", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -622,6 +623,7 @@ public Object[][] testCreateData() { GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL, Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:100-200", SVTestUtils.hg38Dict), SVCallRecord.ComplexEventInterval.decode("DEL_chr2:300-400", SVTestUtils.hg38Dict)), 250, + Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -630,7 +632,7 @@ public Object[][] testCreateData() { ALLELES_CTX, Collections.singletonList(GENOTYPE_CTX_1), null, "++", GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, "chrX", 2000, TEST_ATTRIBUTES_CTX, null), - new SVCallRecord("var11", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null, + new SVCallRecord("var11", 
"chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CTX, Collections.singletonList(GENOTYPE_CTX_1), TEST_ATTRIBUTES, Collections.emptySet(), null) }, @@ -642,7 +644,7 @@ public Object[][] testCreateData() { "chrX", 2000, Map.of("TEST_KEY", "TEST_VAL", GATKSVVCFConstants.CPX_TYPE, "CTX_PP/QQ", GATKSVVCFConstants.CPX_INTERVALS, Collections.emptyList()), null), - new SVCallRecord("var12", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null, + new SVCallRecord("var12", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CTX, Collections.singletonList(GENOTYPE_CTX_1), TEST_ATTRIBUTES, Collections.emptySet(), null) }, diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java index 5a8e97a05fd..110516ac935 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java @@ -10,7 +10,10 @@ import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; import org.broadinstitute.hellbender.tools.spark.sv.discovery.SimpleSVType; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; -import org.broadinstitute.hellbender.tools.sv.cluster.*; +import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVCollapser; +import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVLinkage; +import 
org.broadinstitute.hellbender.tools.sv.cluster.ClusteringParameters; +import org.broadinstitute.hellbender.tools.sv.cluster.SVClusterEngine; import org.broadinstitute.hellbender.utils.GenomeLoc; import org.broadinstitute.hellbender.utils.GenomeLocParser; import org.broadinstitute.hellbender.utils.reference.ReferenceUtils; @@ -33,7 +36,8 @@ public class SVTestUtils { new CanonicalSVCollapser( hg38Reference, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END); + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.FlagFieldLogic.OR); public static final String PESR_ALGORITHM = "pesr"; public static final List DEPTH_ONLY_ALGORITHM_LIST = Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM); @@ -121,6 +125,21 @@ public static SVClusterEngine getNewDefaultMaxCliqueEngine() { .attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0); public final static SVCallRecord makeRecord(final String id, + final String contigA, + final int positionA, + final Boolean strandA, + final String contigB, + final int positionB, + final Boolean strandB, + final GATKSVVCFConstants.StructuralVariantAnnotationType type, + final Integer length, + final List algorithms, + final List alleles, + final List genotypeBuilders) { + return makeRecordWithEvidenceAndQuality(id, contigA, positionA, strandA, contigB, positionB, strandB, type, length, Collections.emptyList(), algorithms, alleles, genotypeBuilders, null); + } + + public final static SVCallRecord makeRecordWithEvidenceAndQuality(final String id, final String contigA, final int positionA, final Boolean strandA, @@ -129,17 +148,19 @@ public final static SVCallRecord makeRecord(final String id, final Boolean strandB, final GATKSVVCFConstants.StructuralVariantAnnotationType type, final Integer length, + final List evidence, final List algorithms, final List alleles, - final List genotypeBuilders) { + final List 
genotypeBuilders, + final Double log10PError) { final Allele refAllele = Allele.create(ReferenceUtils.getRefBaseAtPosition(hg38Reference, contigA, positionA), true); final List newAlleles = replaceRefAlleles(alleles, refAllele); final List genotypes = new ArrayList<>(genotypeBuilders.size()); for (final GenotypeBuilder builder : genotypeBuilders) { genotypes.add(makeGenotypeWithRefAllele(builder, refAllele)); } - return new SVCallRecord(id, contigA, positionA, strandA, contigB, positionB, strandB, type, null, Collections.emptyList(), length, algorithms, - newAlleles, genotypes, Collections.emptyMap(), Collections.emptySet(), null, hg38Dict); + return new SVCallRecord(id, contigA, positionA, strandA, contigB, positionB, strandB, type, null, Collections.emptyList(), length, evidence, algorithms, + newAlleles, genotypes, Collections.emptyMap(), Collections.emptySet(), log10PError, hg38Dict); } public static final Genotype makeGenotypeWithRefAllele(final GenotypeBuilder builder, final Allele refAllele) { @@ -378,16 +399,16 @@ public static SVCallRecord newCallRecordWithAllelesAndSampleName(final String sa builder = builder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber); } return new SVCallRecord("", "chr1", 100, getValidTestStrandA(svtype), "chr1", 199, getValidTestStrandB(svtype), - svtype, null, Collections.emptyList(), 100, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + svtype, null, Collections.emptyList(), 100, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), variantAlleles, Collections.singletonList(builder.make()), Collections.emptyMap(), Collections.emptySet(), null); } - public static SVCallRecord newNamedDeletionRecordWithAttributes(final String id, final Map attributes) { - return new SVCallRecord(id, "chr1", 100, true, "chr1", 199, false, + public static SVCallRecord newDeletionRecordWithAttributes(final Map attributes) { + return new SVCallRecord("", "chr1", 100, true, "chr1", 199, 
false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - 100, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + 100, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), attributes, Collections.emptySet(), null); @@ -398,7 +419,7 @@ public static SVCallRecord newNamedDeletionRecordWithAttributesAndGenotypes(fina final Map attributes) { return new SVCallRecord(id, "chr1", 100, true, "chr1", 199, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - 100, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + 100, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), genotypes, attributes, Collections.emptySet(), null); @@ -416,40 +437,40 @@ public static final Map keyValueArraysToMap(final String[] keys, public static SVCallRecord newCallRecordWithLengthAndType(final Integer length, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) { final int positionB = length == null ? 
1 : CoordMath.getEnd(1, length); return new SVCallRecord("", "chr1", 1, getValidTestStrandA(svtype), "chr1", positionB, getValidTestStrandB(svtype), - svtype, null, Collections.emptyList(), length, PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), + svtype, null, Collections.emptyList(), length, Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null); } public static SVCallRecord newDeletionCallRecordWithIdAndAlgorithms(final String id, final List algorithms) { return new SVCallRecord(id, "chr1", 1, true, "chr1", 100, false, - GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 100, algorithms, Collections.emptyList(), + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 100, Collections.emptyList(), algorithms, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null); } // Note strands and length may not be set properly public static SVCallRecord newPESRCallRecordWithIntervalAndType(final int start, final int end, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) { return new SVCallRecord("", "chr1", start, getValidTestStrandA(svtype), "chr1", end, getValidTestStrandB(svtype), - svtype, null, Collections.emptyList(), getLength(start, end, svtype), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), + svtype, null, Collections.emptyList(), getLength(start, end, svtype), Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null); } // Note strands and length may not be set properly public static SVCallRecord newInsertionWithPositionAndLength(final int start, final int length) { return new SVCallRecord("", "chr1", start, true, "chr1", start + 1, false, - GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, 
Collections.emptyList(), length, PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), + GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null); } public static SVCallRecord newDepthCallRecordWithIntervalAndType(final int start, final int end, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) { return new SVCallRecord("", "chr1", start, getValidTestStrandA(svtype), "chr1", end, getValidTestStrandB(svtype), - svtype, null, Collections.emptyList(), getLength(start, end, svtype), DEPTH_ONLY_ALGORITHM_LIST, Collections.emptyList(), + svtype, null, Collections.emptyList(), getLength(start, end, svtype), Collections.emptyList(), DEPTH_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null); } // Note strands and length may not be set properly public static SVCallRecord newCallRecordWithContigsIntervalAndType(final String startContig, final int start, final String endContig, final int end, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) { return new SVCallRecord("", startContig, start, getValidTestStrandA(svtype), endContig, end, getValidTestStrandB(svtype), - svtype, null, Collections.emptyList(), getLength(start, end, svtype), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), + svtype, null, Collections.emptyList(), getLength(start, end, svtype), Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null); } @@ -463,7 +484,7 @@ public static Integer getLength(final int start, final int end, final GATKSVVCFC } public static SVCallRecord newBndCallRecordWithStrands(final boolean strandA, final boolean strandB) { - return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1000, strandB, 
GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, + return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1000, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(), Collections.singletonList(PESR_ALGORITHM), Collections.emptyList(), Collections.emptyList(), @@ -471,7 +492,7 @@ public static SVCallRecord newBndCallRecordWithStrands(final boolean strandA, fi } public static SVCallRecord newCtxCallRecord() { - return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null, + return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null, Collections.emptyList(), Collections.singletonList(PESR_ALGORITHM), Collections.emptyList(), Collections.emptyList(), @@ -479,7 +500,7 @@ public static SVCallRecord newCtxCallRecord() { } public static SVCallRecord newCpxCallRecordWithLength(final int length) { - return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, null, Collections.emptyList(), length, + return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, null, Collections.emptyList(), length, Collections.emptyList(), Collections.singletonList(PESR_ALGORITHM), Collections.emptyList(), Collections.emptyList(), @@ -487,7 +508,7 @@ public static SVCallRecord newCpxCallRecordWithLength(final int length) { } public static SVCallRecord newCnvCallRecordWithStrands(final Boolean strandA, final Boolean strandB) { - return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1999, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, null, Collections.emptyList(), 1000, + return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 
1999, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, null, Collections.emptyList(), 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), @@ -495,7 +516,7 @@ public static SVCallRecord newCnvCallRecordWithStrands(final Boolean strandA, fi } public static SVCallRecord newCallRecordWithCoordinates(final String id, final String chrA, final int posA, final String chrB, final int posB) { - return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, + return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(), Collections.singletonList("peser"), Collections.emptyList(), Collections.emptyList(), @@ -503,7 +524,7 @@ public static SVCallRecord newCallRecordWithCoordinates(final String id, final S } public static SVCallRecord newCallRecordWithCoordinatesAndType(final String id, final String chrA, final int posA, final String chrB, final int posB, final GATKSVVCFConstants.StructuralVariantAnnotationType type) { - return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, type, null, Collections.emptyList(), getLength(posA, posB, type), + return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, type, null, Collections.emptyList(), getLength(posA, posB, type), Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), @@ -511,7 +532,7 @@ public static SVCallRecord newCallRecordWithCoordinatesAndType(final String id, } public static SVCallRecord newCallRecordWithAlgorithms(final List algorithms) { - return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, + 
return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(), algorithms, Collections.emptyList(), Collections.emptyList(), @@ -519,7 +540,15 @@ public static SVCallRecord newCallRecordWithAlgorithms(final List algori } public static SVCallRecord newCallRecordInsertionWithLength(final Integer length) { - return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, + return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(), + PESR_ONLY_ALGORITHM_LIST, + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyMap(), Collections.emptySet(), null); + } + + public static SVCallRecord newCallRecordInsertionWithLengthAndCoordinates(final String chrA, final int posA, final Integer length) { + return new SVCallRecord("", chrA, posA, true, chrA, posA, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), @@ -597,4 +626,18 @@ public static GenotypeBuilder getDiploidCNVGenotypeBuilder(final String sample, .attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2) .attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber); } + + public static Map buildMapFromArrays(final String[] keys, final Object[] values) { + if (keys.length != values.length) { + throw new TestException("Keys and values have different lengths: " + keys.length + " and " + values.length); + } + final Map map = new HashMap<>(); + for (int i = 0; i < keys.length; i++) { + if (keys[i] == null) { + throw new TestException("Encountered null key"); + } + map.put(keys[i], values[i]); + } + return 
map; + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java index 611763ff23c..384df0f0ef2 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java @@ -9,6 +9,7 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -95,20 +96,22 @@ public void testGetMaxClusterableStartingPosition() { public void testAdd() { //single-sample merge case, ignoring sample sets final SVClusterEngine temp1 = SVClusterEngineFactory.createBinnedCNVDefragmenter(SVTestUtils.hg38Dict, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, SVTestUtils.hg38Reference, paddingFraction, 0.8, SVTestUtils.targetIntervals); - temp1.add(SVTestUtils.call1); + final List output1 = new ArrayList<>(); + output1.addAll(temp1.addAndFlush(SVTestUtils.call1)); //force new cluster by adding a non-overlapping event - temp1.add(SVTestUtils.call3); - final List output1 = temp1.forceFlush(); //flushes all clusters + output1.addAll(temp1.addAndFlush(SVTestUtils.call3)); + output1.addAll(temp1.flush()); //flushes all clusters Assert.assertEquals(output1.size(), 2); SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call1, output1.get(0)); SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call3, output1.get(1)); final SVClusterEngine temp2 = SVClusterEngineFactory.createBinnedCNVDefragmenter(SVTestUtils.hg38Dict, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, SVTestUtils.hg38Reference, paddingFraction, 0.8, SVTestUtils.targetIntervals); - temp2.add(SVTestUtils.call1); - temp2.add(SVTestUtils.call2); //should overlap after padding + final List output2 = 
new ArrayList<>(); + output2.addAll(temp2.addAndFlush(SVTestUtils.call1)); + output2.addAll(temp2.addAndFlush(SVTestUtils.call2)); //should overlap after padding //force new cluster by adding a call on another contig - temp2.add(SVTestUtils.call4_chr10); - final List output2 = temp2.forceFlush(); + output2.addAll(temp2.addAndFlush(SVTestUtils.call4_chr10)); + output2.addAll(temp2.flush()); Assert.assertEquals(output2.size(), 2); Assert.assertEquals(output2.get(0).getPositionA(), SVTestUtils.call1.getPositionA()); Assert.assertEquals(output2.get(0).getPositionB(), SVTestUtils.call2.getPositionB()); @@ -116,9 +119,10 @@ public void testAdd() { //cohort case, checking sample set overlap final SVClusterEngine temp3 = SVClusterEngineFactory.createCNVDefragmenter(SVTestUtils.hg38Dict, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, SVTestUtils.hg38Reference, CNVLinkage.DEFAULT_PADDING_FRACTION, CNVLinkage.DEFAULT_SAMPLE_OVERLAP); - temp3.add(SVTestUtils.call1); - temp3.add(SVTestUtils.sameBoundsSampleMismatch); - final List output3 = temp3.forceFlush(); + final List output3 = new ArrayList<>(); + output3.addAll(temp3.addAndFlush(SVTestUtils.call1)); + output3.addAll(temp3.addAndFlush(SVTestUtils.sameBoundsSampleMismatch)); + output3.addAll(temp3.flush()); Assert.assertEquals(output3.size(), 2); } } \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java index 012c302bd83..cb540d4988e 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java @@ -21,47 +21,47 @@ public class CNVDefragmenterTest { @Test public void testClusterTogether() { final SVCallRecord deletion = new SVCallRecord("test_del", "chr1", 1000, true, "chr1", 1999, false, 
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary); final SVCallRecord duplication = new SVCallRecord("test_dup", "chr1", 1000, false, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, null, Collections.emptyList(), - 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary); Assert.assertFalse(defragmenter.areClusterable(deletion, duplication), "Different sv types should not cluster"); final SVCallRecord duplicationNonDepthOnly = new SVCallRecord("test_dup", "chr1", 1000, false, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, null, Collections.emptyList(), - 1000, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM), + 1000, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary); Assert.assertFalse(defragmenter.areClusterable(duplication, duplicationNonDepthOnly), "Clustered records must be depth-only"); final SVCallRecord cnv = new SVCallRecord("test_cnv", "chr1", 1000, null, "chr1", 1999, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, null, Collections.emptyList(), - 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + 1000, Collections.emptyList(), 
Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary); Assert.assertFalse(defragmenter.areClusterable(deletion, cnv), "Different sv types should not cluster"); final SVCallRecord insertion = new SVCallRecord("test_ins", "chr1", 1000, true, "chr1", 1001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), - 1000, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, + 1000, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary); Assert.assertFalse(defragmenter.areClusterable(insertion, insertion), "Only CNVs should be valid"); final SVCallRecord deletion2 = new SVCallRecord("test_del2", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary); Assert.assertTrue(defragmenter.areClusterable(deletion, deletion2), "Valid identical records should cluster"); final SVCallRecord deletion3 = new SVCallRecord("test_del3", "chr1", 2999, true, "chr1", 3998, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, 
dictionary); Assert.assertTrue(defragmenter.areClusterable(deletion, deletion3), "Should cluster due to overlap"); final SVCallRecord deletion4 = new SVCallRecord("test_del3", "chr1", 3000, true, "chr1", 3999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary); Assert.assertFalse(defragmenter.areClusterable(deletion, deletion4), "Should barely not cluster"); @@ -190,7 +190,7 @@ public Object[][] recordPairs() { @Test(dataProvider= "maxPositionIntervals") public void testGetMaxClusterableStartingPosition(final int start, final int end) { final SVCallRecord call1 = new SVCallRecord("call1", "chr1", start, true, "chr1", end, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - end - start + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + end - start + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary); final int maxClusterableStart = defragmenter.getMaxClusterableStartingPosition(call1); @@ -198,7 +198,7 @@ public void testGetMaxClusterableStartingPosition(final int start, final int end final int call2Start = maxClusterableStart; final int call2End = dictionary.getSequence(call1.getContigA()).getSequenceLength(); final SVCallRecord call2 = new SVCallRecord("call2", "chr1", call2Start, true, "chr1", call2End, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - call2End - call2Start + 1, 
Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + call2End - call2Start + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary); Assert.assertTrue(defragmenter.areClusterable(call1, call2)); @@ -206,7 +206,7 @@ public void testGetMaxClusterableStartingPosition(final int start, final int end final int call3Start = maxClusterableStart + 1; final int call3End = dictionary.getSequence(call1.getContigA()).getSequenceLength(); final SVCallRecord call3 = new SVCallRecord("call3", "chr1", call3Start, true, "chr1", call3End, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - call3End - call3Start + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + call3End - call3Start + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary); Assert.assertFalse(defragmenter.areClusterable(call1, call3)); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java index f46e4a6e7c9..3a692ab4001 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java @@ -26,27 +26,43 @@ public class CanonicalSVCollapserUnitTest { private static final CanonicalSVCollapser collapser = new CanonicalSVCollapser( SVTestUtils.hg38Reference, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END); + 
CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.FlagFieldLogic.OR); private static final CanonicalSVCollapser collapserMinMax = new CanonicalSVCollapser( SVTestUtils.hg38Reference, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END); + CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END, + CanonicalSVCollapser.FlagFieldLogic.OR); private static final CanonicalSVCollapser collapserMaxMin = new CanonicalSVCollapser( SVTestUtils.hg38Reference, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MAX_START_MIN_END); + CanonicalSVCollapser.BreakpointSummaryStrategy.MAX_START_MIN_END, + CanonicalSVCollapser.FlagFieldLogic.OR); private static final CanonicalSVCollapser collapserMean = new CanonicalSVCollapser( SVTestUtils.hg38Reference, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEAN_START_MEAN_END); + CanonicalSVCollapser.BreakpointSummaryStrategy.MEAN_START_MEAN_END, + CanonicalSVCollapser.FlagFieldLogic.OR); private static final CanonicalSVCollapser collapserRepresentative = new CanonicalSVCollapser( SVTestUtils.hg38Reference, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE); + CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE, + CanonicalSVCollapser.FlagFieldLogic.OR); private static final CanonicalSVCollapser collapserSpecificAltAllele = new CanonicalSVCollapser( SVTestUtils.hg38Reference, CanonicalSVCollapser.AltAlleleSummaryStrategy.MOST_SPECIFIC_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END); + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.FlagFieldLogic.OR); + private static final CanonicalSVCollapser collapserFlagAnd = new 
CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.FlagFieldLogic.AND); + private static final CanonicalSVCollapser collapserFlagAlwaysFalse = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.FlagFieldLogic.ALWAYS_FALSE); private static final Allele MEI_INSERTION_ALLELE = Allele.create(""); private static final Allele SVA_INSERTION_ALLELE = Allele.create(""); @@ -1234,15 +1250,160 @@ public void collapseLengthTest(final SVCallRecord record, Assert.assertEquals(collapser.collapseLength(record, type, record.getPositionA(), record.getPositionB()), expectedLength); } - @DataProvider(name = "collapseIdsTestData") - public Object[][] collapseIdsTestData() { + + @DataProvider(name = "collapseAttributesTestData") + public Object[][] collapseAttributesTestData() { return new Object[][]{ - {Collections.singletonList("var1"), "var1"}, - {Lists.newArrayList("var1", "var2"), "var1"}, - {Lists.newArrayList("var2", "var1"), "var1"}, + // Empty case + { + new String[]{}, new Object[]{}, + new String[]{}, new Object[]{}, + new String[]{}, new Object[]{}, + CanonicalSVCollapser.FlagFieldLogic.OR + }, + // Use representative + { + new String[]{"TEST_KEY"}, new Object[]{"TEST_VALUE"}, + new String[]{}, new Object[]{}, + new String[]{"TEST_KEY"}, new Object[]{"TEST_VALUE"}, + CanonicalSVCollapser.FlagFieldLogic.OR + }, + { + new String[]{}, new Object[]{}, + new String[]{"TEST_KEY"}, new Object[]{"TEST_VALUE"}, + new String[]{}, new Object[]{}, + CanonicalSVCollapser.FlagFieldLogic.OR + }, + { + new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"}, + new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE2"}, + new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"}, 
+ CanonicalSVCollapser.FlagFieldLogic.OR + }, + { + new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"}, + new String[]{"TEST_KEY1", "TEST_KEY2"}, new Object[]{"TEST_VALUE12", "TEST_VALUE22"}, + new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"}, + CanonicalSVCollapser.FlagFieldLogic.OR + }, + // Reserved flags OR + { + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{}, new Object[]{}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + CanonicalSVCollapser.FlagFieldLogic.OR + }, + { + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + CanonicalSVCollapser.FlagFieldLogic.OR + }, + { + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + CanonicalSVCollapser.FlagFieldLogic.OR + }, + { + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE}, + new String[]{}, new Object[]{}, // False results in non-assignment, implying false + CanonicalSVCollapser.FlagFieldLogic.OR + }, + { + new String[]{}, new Object[]{}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE}, + new String[]{}, new Object[]{}, + CanonicalSVCollapser.FlagFieldLogic.OR + }, + { + new String[]{GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{}, new Object[]{}, + new String[]{GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE}, 
+ CanonicalSVCollapser.FlagFieldLogic.OR + }, + { + new String[]{}, new Object[]{}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE, GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE, Boolean.TRUE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE, GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE, Boolean.TRUE}, + CanonicalSVCollapser.FlagFieldLogic.OR + }, + // Reserved flags AND + { + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{}, new Object[]{}, + new String[]{}, new Object[]{}, + CanonicalSVCollapser.FlagFieldLogic.AND + }, + { + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + CanonicalSVCollapser.FlagFieldLogic.AND + }, + { + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE}, + new String[]{}, new Object[]{}, + CanonicalSVCollapser.FlagFieldLogic.AND + }, + { + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE}, + new String[]{}, new Object[]{}, + CanonicalSVCollapser.FlagFieldLogic.AND + }, + { + new String[]{}, new Object[]{}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE}, + new String[]{}, new Object[]{}, + CanonicalSVCollapser.FlagFieldLogic.AND + }, + { + new String[]{GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{}, new Object[]{}, + new String[]{}, new Object[]{}, + CanonicalSVCollapser.FlagFieldLogic.AND + }, + { + new String[]{}, new 
Object[]{}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE, GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE, Boolean.TRUE}, + new String[]{}, new Object[]{}, + CanonicalSVCollapser.FlagFieldLogic.AND + }, + // Reserved flags ALWAYS_FALSE + { + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE}, + new String[]{}, new Object[]{}, + CanonicalSVCollapser.FlagFieldLogic.ALWAYS_FALSE + }, }; } + @Test(dataProvider= "collapseAttributesTestData") + public void collapseAttributesTest(final String[] representativeKeys, final Object[] representativeValues, + final String[] secondKeys, final Object[] secondValues, + final String[] expectedKeys, final Object[] expectedValues, + final CanonicalSVCollapser.FlagFieldLogic flagLogic) { + final Map representativeMap = SVTestUtils.buildMapFromArrays(representativeKeys, representativeValues); + final Map secondMap = SVTestUtils.buildMapFromArrays(secondKeys, secondValues); + final Map expectedMap = SVTestUtils.buildMapFromArrays(expectedKeys, expectedValues); + final SVCallRecord representativeCall = SVTestUtils.newDeletionRecordWithAttributes(representativeMap); + final SVCallRecord secondCall = SVTestUtils.newDeletionRecordWithAttributes(secondMap); + final Collection collection = Lists.newArrayList(secondCall, representativeCall); + final CanonicalSVCollapser testCollapser = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + flagLogic); + final Map result = new HashMap<>(testCollapser.collapseAttributes(representativeCall, collection)); + // Ignore MEMBERS field + result.remove(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY); + Assert.assertEquals(result, expectedMap); + } + @DataProvider(name = "getMostPreciseCallsTestData") 
public Object[][] getMostPreciseCallsTestData() { return new Object[][]{ @@ -1428,32 +1589,205 @@ public void collapseIntervalTest(final String[] contigs, final int[] starts, fin collapseIntervalTestHelper(collapserMean, svtype, contigs, records, expectedMean); } - @Test - public void collapseIntervalRepresentativeTest() { + @DataProvider(name = "collapseIntervalRepresentativeTestData") + public Object[][] collapseIntervalRepresentativeTestData() { + return new Object[][]{ + // equal evidence, expect second with more carriers + { + null, + null, + new GATKSVVCFConstants.EvidenceTypes[]{}, + new GATKSVVCFConstants.EvidenceTypes[]{}, + false + }, + { + 0., + null, + new GATKSVVCFConstants.EvidenceTypes[]{}, + new GATKSVVCFConstants.EvidenceTypes[]{}, + false + }, + { + null, + 0., + new GATKSVVCFConstants.EvidenceTypes[]{}, + new GATKSVVCFConstants.EvidenceTypes[]{}, + false + }, + { + null, + null, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.SR}, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.SR}, + false + }, + { + -99., + -99., + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR}, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR}, + false + }, + { + null, + null, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF}, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF}, + false + }, + // quality based + { + -99., + null, + new GATKSVVCFConstants.EvidenceTypes[]{}, + new GATKSVVCFConstants.EvidenceTypes[]{}, + true + }, + { + null, + -99., + new GATKSVVCFConstants.EvidenceTypes[]{}, + new GATKSVVCFConstants.EvidenceTypes[]{}, + false + }, + { + -10., + -9., + new GATKSVVCFConstants.EvidenceTypes[]{}, + new GATKSVVCFConstants.EvidenceTypes[]{}, + true + }, + { + 
-10., + -9., + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE}, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR}, + true + }, + // SR > PE + { + -99., + -99., + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR}, + new GATKSVVCFConstants.EvidenceTypes[]{}, + true + }, + // note quality null = 0 + { + null, + 0., + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR}, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE}, + true + }, + { + 0., + null, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR}, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD}, + true + }, + { + null, + 0., + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR}, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF}, + true + }, + { + null, + null, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR}, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.BAF}, + true + }, + { + null, + null, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.BAF}, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR}, + false + }, + // PE > others + { + null, + null, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE}, + new GATKSVVCFConstants.EvidenceTypes[]{}, + true + }, + { + null, + null, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE}, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD}, + true + }, + { + null, + null, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE}, + new 
GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.BAF}, + true + }, + // irrelevant evidence, expect second with more carriers + { + null, + null, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD}, + new GATKSVVCFConstants.EvidenceTypes[]{}, + false + }, + { + null, + null, + new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF}, + new GATKSVVCFConstants.EvidenceTypes[]{}, + false + }, + }; + } + + @Test(dataProvider = "collapseIntervalRepresentativeTestData") + public void collapseIntervalRepresentativeTest(final Double log10PErrorA, + final Double log10PErrorB, + final GATKSVVCFConstants.EvidenceTypes[] evidenceA, + final GATKSVVCFConstants.EvidenceTypes[] evidenceB, + final boolean expectFirst) { // Choose second record with more carriers final List records = Lists.newArrayList( - SVTestUtils.makeRecord("record1", "chr1", 1000, true, + SVTestUtils.makeRecordWithEvidenceAndQuality("record1", "chr1", 1000, true, "chr1", 2000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, - null, Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), + null, Arrays.asList(evidenceA), Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Lists.newArrayList( new GenotypeBuilder("sample1", Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2), new GenotypeBuilder("sample2", Lists.newArrayList(Allele.REF_N, Allele.REF_N)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2) - ) + ), + log10PErrorA ), - SVTestUtils.makeRecord("record2", "chr1", 1001, true, + SVTestUtils.makeRecordWithEvidenceAndQuality("record2", "chr1", 1001, true, "chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, - null, Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), + null, Arrays.asList(evidenceB), 
Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Lists.newArrayList( new GenotypeBuilder("sample1", Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2), new GenotypeBuilder("sample2", Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2) - ) + ), + log10PErrorB ) ); final Pair result = collapserRepresentative.collapseInterval(records); - Assert.assertEquals((int) result.getLeft(), 1001); - Assert.assertEquals((int) result.getRight(), 2001); + if (expectFirst) { + Assert.assertEquals((int) result.getLeft(), 1000); + Assert.assertEquals((int) result.getRight(), 2000); + } else { + Assert.assertEquals((int) result.getLeft(), 1001); + Assert.assertEquals((int) result.getRight(), 2001); + } + } + @Test + public void collapseIntervalRepresentativeByCoordinatesTest() { // record2 and record3 have the best carrier status, but choose second record which is closer to all others on average final List records2 = Lists.newArrayList( @@ -1483,9 +1817,9 @@ public void collapseIntervalRepresentativeTest() { ) ); final Pair result2 = collapserRepresentative.collapseInterval(records2); - Assert.assertEquals((int) result2.getLeft(), 999); - Assert.assertEquals((int) result2.getRight(), 2000); - } + Assert.assertEquals((int) result2.getLeft(), 999); + Assert.assertEquals((int) result2.getRight(), 2000); +} @DataProvider(name = "distanceDataProvider") public Object[][] distanceDataProvider() { @@ -1583,7 +1917,7 @@ public void testComplexSubtypeAndIntervals() { GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Arrays.asList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:6000-8000", SVTestUtils.hg38Dict)), - null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM), + null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM), 
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final SVCallRecord cpx2 = new SVCallRecord("cpx1", "chr1", 1000, null, @@ -1591,7 +1925,7 @@ public void testComplexSubtypeAndIntervals() { GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Arrays.asList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:6000-8000", SVTestUtils.hg38Dict)), - null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM), + null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM), Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final SVCallRecord result = collapser.collapse(new SVClusterEngine.OutputCluster(Lists.newArrayList(cpx1, cpx2))); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java index 3b51938e997..a3f768c26bb 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java @@ -1,7 +1,10 @@ package org.broadinstitute.hellbender.tools.sv.cluster; import com.google.common.collect.Lists; -import htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.GenotypesContext; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; import org.broadinstitute.hellbender.tools.sv.SVCallRecord; import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils; @@ -50,7 +53,7 @@ private static Integer inferLength(final String contigA, final int posA, final S 
@BeforeTest public void initializeClusterEngine() { - engine.add(SVTestUtils.call1); + engine.addAndFlush(SVTestUtils.call1); linkageSizeSimilarity.setDepthOnlyParams(depthOnlyParametersSizeSimilarity); linkageSizeSimilarity.setMixedParams(mixedParametersSizeSimilarity); linkageSizeSimilarity.setEvidenceParams(evidenceParametersSizeSimilarity); @@ -167,12 +170,12 @@ public void testClusterTogetherInvalidInterval() { // End position beyond contig end after padding final SVCallRecord deletion1 = new SVCallRecord("test_del", "chr1", 1000, true, "chr1", 248956423 + SVTestUtils.defaultEvidenceParameters.getWindow(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM), + null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final SVCallRecord deletion2 = new SVCallRecord("test_del", "chr1", 1000, true, "chr1", 248956422, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM), + null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); engine.getLinkage().areClusterable(deletion1, deletion2); @@ -204,7 +207,7 @@ public void testGetMaxClusterableStartingPosition(final int start, final int end private void testGetMaxClusterableStartingPositionWithAlgorithm(final int start, final int end, final String algorithm) { final SVCallRecord call1 = new SVCallRecord("call1", "chr1", start, true, "chr1", end, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - end - start 
+ 1, Collections.singletonList(algorithm), + end - start + 1, Collections.emptyList(), Collections.singletonList(algorithm), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final int maxClusterableStart = engine.getLinkage().getMaxClusterableStartingPosition(call1); @@ -212,12 +215,12 @@ private void testGetMaxClusterableStartingPositionWithAlgorithm(final int start, final int call2Start = maxClusterableStart; final SVCallRecord call2Depth = new SVCallRecord("call2", "chr1", call2Start, true, "chr1", call2Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - call1.getLength(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + call1.getLength(), Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final SVCallRecord call2Pesr = new SVCallRecord("call2", "chr1", call2Start, true, "chr1", call2Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - call1.getLength(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, + call1.getLength(), Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); Assert.assertTrue(engine.getLinkage().areClusterable(call1, call2Depth) || engine.getLinkage().areClusterable(call1, call2Pesr)); @@ -225,12 +228,12 @@ private void testGetMaxClusterableStartingPositionWithAlgorithm(final int start, final int call3Start = maxClusterableStart + 1; final SVCallRecord call3Depth = new SVCallRecord("call2", "chr1", call3Start, true, "chr1", 
call3Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - call1.getLength(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + call1.getLength(), Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final SVCallRecord call3Pesr = new SVCallRecord("call2", "chr1", call3Start, true, "chr1", call3Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - call1.getLength(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, + call1.getLength(), Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); Assert.assertFalse(engine.getLinkage().areClusterable(call1, call3Depth) || engine.getLinkage().areClusterable(call1, call3Pesr)); @@ -286,12 +289,12 @@ public Object[][] clusterTogetherVaryPositionsProvider() { public void testClusterTogetherVaryPositions(final int start1, final int end1, final int start2, final int end2, final boolean result) { final SVCallRecord call1 = new SVCallRecord("call1", "chr1", start1, true, "chr1", end1, false, - GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end1 - start1 + 1, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end1 - start1 + 1, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), SVTestUtils.threeGenotypes, Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final SVCallRecord call2 = new 
SVCallRecord("call2", "chr1", start2, true, "chr1", end2, false, - GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end2 - start2 + 1, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM), + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end2 - start2 + 1, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), SVTestUtils.threeGenotypes, Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); Assert.assertEquals(engine.getLinkage().areClusterable(call1, call2), result); @@ -303,12 +306,12 @@ public void testClusterTogetherVaryTypes() { // Pass in null strands to let them be determined automatically final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, SVTestUtils.getValidTestStrandA(type1), "chr1", 2001, SVTestUtils.getValidTestStrandB(type1), type1, null, Collections.emptyList(), - SVTestUtils.getLength(1000, 2001, type1), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM), + SVTestUtils.getLength(1000, 2001, type1), Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); for (final GATKSVVCFConstants.StructuralVariantAnnotationType type2 : GATKSVVCFConstants.StructuralVariantAnnotationType.values()) { final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1000, SVTestUtils.getValidTestStrandA(type2), "chr1", 2001, SVTestUtils.getValidTestStrandB(type2), type2, null, Collections.emptyList(), - SVTestUtils.getLength(1000, 2001, type2), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM), + SVTestUtils.getLength(1000, 2001, type2), Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), 
Collections.emptySet(), null, SVTestUtils.hg38Dict); // Should only cluster together if same type, except CNVs if ((type1 == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV && call2.isSimpleCNV()) || @@ -328,13 +331,13 @@ public void testClusterTogetherVaryStrands() { for (final Boolean strand1B : bools) { final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, strand1A, "chr1", 2001, strand1B, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), - null, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM), + null, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); for (final Boolean strand2A : bools) { for (final Boolean strand2B : bools) { final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1000, strand2A, "chr1", 2001, strand2B, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), - null, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM), + null, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); // Should only cluster if strands match Assert.assertEquals(engine.getLinkage().areClusterable(call1, call2), strand1A == strand2A && strand1B == strand2B); @@ -353,7 +356,7 @@ public void testClusterTogetherVaryContigs() { final String contig1B = contigs.get(j); final SVCallRecord call1 = new SVCallRecord("call1", contig1A, 1000, true, contig1B, 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), - null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, + null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, 
SVTestUtils.hg38Dict); for (int k = 0; k < contigs.size(); k++) { final String contig2A = contigs.get(k); @@ -361,7 +364,7 @@ public void testClusterTogetherVaryContigs() { final String contig2B = contigs.get(m); final SVCallRecord call2 = new SVCallRecord("call2", contig2A, 1000, true, contig2B, 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), - null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, + null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); // Should only cluster if contigs match Assert.assertEquals(engine.getLinkage().areClusterable(call1, call2), contig1A.equals(contig2A) && contig1B.equals(contig2B)); @@ -381,11 +384,11 @@ public void testClusterTogetherVaryAlgorithms() { for (final List algorithms1 : algorithmsList) { final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, true, "chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - 1002, algorithms1, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); + 1002, Collections.emptyList(), algorithms1, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); for (final List algorithms2 : algorithmsList) { final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1000, true, "chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - 1002, algorithms2, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); + 1002, Collections.emptyList(), algorithms2, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); // All 
combinations should cluster Assert.assertTrue(engine.getLinkage().areClusterable(call1, call2)); } @@ -424,6 +427,50 @@ public void testClusterTogetherCNVs() { Assert.assertFalse(engine.getLinkage().areClusterable(del1, dup1)); } + @DataProvider(name = "testMatchCNVNoGTData") + public Object[][] testMatchCNVNoGTData() { + return new Object[][]{ + // Empty + {0, new int[]{}, new int[]{}, true}, + // Both equal + {0, new int[]{0}, new int[]{0}, true}, + {1, new int[]{1}, new int[]{1}, true}, + {2, new int[]{2}, new int[]{2}, true}, + {2, new int[]{3}, new int[]{3}, true}, + // Unequal + {2, new int[]{1}, new int[]{2}, false}, + {2, new int[]{2}, new int[]{1}, false}, + // Equal multiple + {2, new int[]{2, 2}, new int[]{2, 2}, true}, + {2, new int[]{4, 2}, new int[]{4, 2}, true}, + // Unequal multiple + {2, new int[]{2, 2}, new int[]{2, 1}, false}, + {2, new int[]{0, 2}, new int[]{1, 1}, false}, + {2, new int[]{3, 2}, new int[]{2, 2}, false}, + {2, new int[]{6, 2}, new int[]{4, 2}, false}, + }; + } + + @Test(dataProvider= "testMatchCNVNoGTData") + public void testMatchCNVNoGT(final int ploidy, final int[] copyNumbers1, final int[] copyNumbers2, final boolean expected) { + final List alleles = Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_CNV); + final GATKSVVCFConstants.StructuralVariantAnnotationType svtype = GATKSVVCFConstants.StructuralVariantAnnotationType.CNV; + // Create genotypes with copy number attribute (and no GT) + final SVCallRecord recordCN1 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers1, GATKSVVCFConstants.COPY_NUMBER_FORMAT); + final SVCallRecord recordCN2 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers2, GATKSVVCFConstants.COPY_NUMBER_FORMAT); + + // With sample overlap + final ClusteringParameters depthOnlyParams = ClusteringParameters.createDepthParameters(0.8, 0, 10000000, 1); + final CanonicalSVLinkage linkage = new CanonicalSVLinkage<>(SVTestUtils.hg38Dict, false); + linkage.setDepthOnlyParams(depthOnlyParams); + + 
Assert.assertEquals(linkage.areClusterable(recordCN1, recordCN2), expected); + + final SVCallRecord recordRDCN1 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers1, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT); + final SVCallRecord recordRDCN2 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers2, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT); + Assert.assertEquals(linkage.areClusterable(recordRDCN1, recordRDCN2), expected); + } + @DataProvider(name = "testClusterTogetherIntervaledComplexData") public Object[][] testClusterTogetherIntervaledComplexData() { return new Object[][]{ @@ -510,7 +557,7 @@ public void testClusterTogetherIntervaledComplex(final String contigA, final int GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.delINV, Arrays.asList(SVCallRecord.ComplexEventInterval.decode("DEL_chr1:1100-1500", SVTestUtils.hg38Dict), SVCallRecord.ComplexEventInterval.decode("INV_chr1:1600-1900", SVTestUtils.hg38Dict)), - 1000, Collections.singletonList(SVTestUtils.PESR_ALGORITHM), + 1000, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM), Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final Integer length2 = inferLength(contigA, posA, contigB, posB); @@ -519,7 +566,7 @@ public void testClusterTogetherIntervaledComplex(final String contigA, final int GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, subtype, cpxIntervals, - length2, Collections.singletonList(SVTestUtils.PESR_ALGORITHM), + length2, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM), Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); Assert.assertEquals(engine.getLinkage().areClusterable(cpx1, cpx2), expected); @@ -588,7 +635,7 @@ public void 
testClusterTogetherInsertedComplex(final String contigA, final int p GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Arrays.asList(new SVCallRecord.ComplexEventInterval(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, new SimpleInterval("chr1", 6000, 8000))), - 2000, Collections.singletonList(SVTestUtils.PESR_ALGORITHM), + 2000, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM), Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final Integer length2 = cpxIntervals.get(0).getInterval().size(); @@ -597,7 +644,7 @@ public void testClusterTogetherInsertedComplex(final String contigA, final int p GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, subtype, cpxIntervals, - length2, Collections.singletonList(SVTestUtils.PESR_ALGORITHM), + length2, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM), Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); Assert.assertEquals(engine.getLinkage().areClusterable(cpx1, cpx2), expected); @@ -608,10 +655,10 @@ public void testClusterTogetherVaryParameters() { final SVClusterEngine testEngine1 = SVTestUtils.getNewDefaultSingleLinkageEngine(); final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, true, "chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - 1002, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); + 1002, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), 
Collections.emptySet(), null, SVTestUtils.hg38Dict); final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1100, true, "chr1", 2101, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - 1002, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); + 1002, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); // Cluster with default parameters Assert.assertTrue(testEngine1.getLinkage().areClusterable(call1, call2)); final ClusteringParameters exactMatchParameters = ClusteringParameters.createDepthParameters(1.0, 0, 0, 1.0); @@ -650,20 +697,22 @@ public void testAddVaryPositions(final int positionA1, final int positionB1, } final SVCallRecord call1 = new SVCallRecord("call1", "chr1", positionA1, true, "chr1", positionB1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - positionB1 - positionA1 + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + positionB1 - positionA1 + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final SVCallRecord call2 = new SVCallRecord("call1", "chr1", positionA2, true, "chr1", positionB2, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - positionB2 - positionA2 + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + positionB2 - positionA2 + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), 
Collections.emptySet(), null, SVTestUtils.hg38Dict); final SVCallRecord call3 = new SVCallRecord("call1", "chr1", positionA3, true, "chr1", positionB3, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), - positionB3 - positionA3 + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + positionB3 - positionA3 + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); - engine.add(call1); - engine.add(call2); - engine.add(call3); - Assert.assertEquals(engine.forceFlush().size(), result); + final List output = new ArrayList<>(); + output.addAll(engine.addAndFlush(call1)); + output.addAll(engine.addAndFlush(call2)); + output.addAll(engine.addAndFlush(call3)); + output.addAll(engine.flush()); + Assert.assertEquals(output.size(), result); } @Test @@ -671,22 +720,24 @@ public void testAdd() { //single-sample merge case, ignoring sample sets final SVClusterEngine temp1 = SVTestUtils.getNewDefaultSingleLinkageEngine(); Assert.assertTrue(temp1.isEmpty()); - temp1.add(SVTestUtils.call1); + final List output1 = new ArrayList<>(); + output1.addAll(temp1.addAndFlush(SVTestUtils.call1)); Assert.assertFalse(temp1.isEmpty()); //force new cluster by adding a non-overlapping event - temp1.add(SVTestUtils.call3); - final List output1 = temp1.forceFlush(); //flushes all clusters + output1.addAll(temp1.addAndFlush(SVTestUtils.call3)); + output1.addAll(temp1.flush()); //flushes all clusters Assert.assertTrue(temp1.isEmpty()); Assert.assertEquals(output1.size(), 2); SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call1, output1.get(0)); SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call3, output1.get(1)); final SVClusterEngine temp2 = SVTestUtils.getNewDefaultSingleLinkageEngine(); - temp2.add(SVTestUtils.call1); - temp2.add(SVTestUtils.overlapsCall1); + 
final List output2 = new ArrayList<>(); + output2.addAll(temp2.addAndFlush(SVTestUtils.call1)); + output2.addAll(temp2.addAndFlush(SVTestUtils.overlapsCall1)); //force new cluster by adding a call on another contig - temp2.add(SVTestUtils.call4_chr10); - final List output2 = temp2.forceFlush(); + output2.addAll(temp2.addAndFlush(SVTestUtils.call4_chr10)); + output2.addAll(temp2.flush()); Assert.assertEquals(output2.size(), 2); //median of two items ends up being the second item here Assert.assertEquals(output2.get(0).getPositionA(), SVTestUtils.call1.getPositionA()); @@ -695,9 +746,10 @@ public void testAdd() { //checking insensitivity to sample set overlap final SVClusterEngine temp3 = SVTestUtils.getNewDefaultSingleLinkageEngine(); - temp3.add(SVTestUtils.call1); - temp3.add(SVTestUtils.sameBoundsSampleMismatch); - final List output3 = temp3.forceFlush(); + final List output3 = new ArrayList<>(); + output3.addAll(temp3.addAndFlush(SVTestUtils.call1)); + output3.addAll(temp3.addAndFlush(SVTestUtils.sameBoundsSampleMismatch)); + output3.addAll(temp3.flush()); Assert.assertEquals(output3.size(), 1); Assert.assertEquals(output3.get(0).getPositionA(), SVTestUtils.call1.getPositionA()); Assert.assertEquals(output3.get(0).getPositionB(), SVTestUtils.call1.getPositionB()); @@ -710,12 +762,13 @@ public void testAddMaxCliqueLarge() { final int numRecords = 100; final SVClusterEngine engine = SVTestUtils.getNewDefaultMaxCliqueEngine(); final int length = 5000; + final List result = new ArrayList<>(); for (int i = 0; i < numRecords; i++) { final int start = 1000 + 10 * i; final int end = start + length - 1; - engine.add(SVTestUtils.newPESRCallRecordWithIntervalAndType(start, end, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)); + result.addAll(engine.addAndFlush(SVTestUtils.newPESRCallRecordWithIntervalAndType(start, end, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); } - final List result = engine.forceFlush(); + result.addAll(engine.flush()); 
Assert.assertEquals(result.size(), 50); for (final SVCallRecord resultRecord : result) { Assert.assertTrue(resultRecord.getAttributes().containsKey(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)); @@ -778,21 +831,14 @@ public void testGetCarrierSamplesBiallelic(final int ploidy, final Allele refAll } // Create genotypes with copy number attribute (and no GT) - final List genotypesWithCopyNumber = IntStream.range(0, copyNumbers.length) - .mapToObj(i -> new GenotypeBuilder(String.valueOf(i)) - .attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumbers[i]) - .attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy) - .alleles(SVTestUtils.buildHomAlleleListWithPloidy(Allele.NO_CALL, ploidy)) - .make()) - .collect(Collectors.toList()); - final SVCallRecord recordWithCopyNumber = new SVCallRecord("", "chr1", 1000, SVTestUtils.getValidTestStrandA(svtype), - "chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, null, Collections.emptyList(), - 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), - alleles, GenotypesContext.copy(genotypesWithCopyNumber), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); - final Set resultWithCopyNumber = recordWithCopyNumber.getCarrierSampleSet(); - + final SVCallRecord recordCN = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers, GATKSVVCFConstants.COPY_NUMBER_FORMAT); + final Set resultWithCopyNumber = recordCN.getCarrierSampleSet(); Assert.assertEquals(resultWithCopyNumber, expectedResult); + final SVCallRecord recordRDCN = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT); + final Set resultWithRDCopyNumber = recordRDCN.getCarrierSampleSet(); + Assert.assertEquals(resultWithRDCopyNumber, expectedResult); + // Create genotypes with GT (and no copy number attribute) final List genotypesWithGenotype = IntStream.range(0, copyNumbers.length) .mapToObj(i -> new GenotypeBuilder(String.valueOf(i)) @@ -802,13 +848,29 @@ 
public void testGetCarrierSamplesBiallelic(final int ploidy, final Allele refAll .collect(Collectors.toList()); final SVCallRecord recordWithGenotype = new SVCallRecord("", "chr1", 1000, SVTestUtils.getValidTestStrandA(svtype), "chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, null, Collections.emptyList(), - 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), alleles, GenotypesContext.copy(genotypesWithGenotype), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); final Set resultWithGenotype = recordWithGenotype.getCarrierSampleSet(); Assert.assertEquals(resultWithGenotype, expectedResult); } + private SVCallRecord getCNVRecordWithCN(final int ploidy, List alleles, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype, + final int[] copyNumbers, final String cnField) { + // Create genotypes with copy number attribute (and no GT) + final List genotypesWithCopyNumber = IntStream.range(0, copyNumbers.length) + .mapToObj(i -> new GenotypeBuilder(String.valueOf(i)) + .attribute(cnField, copyNumbers[i]) + .attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy) + .alleles(SVTestUtils.buildHomAlleleListWithPloidy(Allele.NO_CALL, ploidy)) + .make()) + .collect(Collectors.toList()); + return new SVCallRecord("", "chr1", 1000, SVTestUtils.getValidTestStrandA(svtype), + "chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, null, Collections.emptyList(), + 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + alleles, GenotypesContext.copy(genotypesWithCopyNumber), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict); + } + @Test public void testLargeRandom() { final Random rand = new Random(42); @@ -819,8 +881,14 @@ public void testLargeRandom() { records.add(SVTestUtils.newPESRCallRecordWithIntervalAndType(Math.min(pos1, pos2), 
Math.max(pos1, pos2), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)); } final SVClusterEngine engine = SVTestUtils.getNewDefaultMaxCliqueEngine(); - records.stream().sorted(SVCallRecordUtils.getCallComparator(SVTestUtils.hg38Dict)).forEach(engine::add); - final List output = engine.forceFlush(); + final List output = new ArrayList<>( + records.stream() + .sorted(SVCallRecordUtils.getCallComparator(SVTestUtils.hg38Dict)) + .map(engine::addAndFlush) + .flatMap(List::stream) + .collect(Collectors.toUnmodifiableList()) + ); + output.addAll(engine.flush()); Assert.assertEquals(output.size(), 2926); } } \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineUnitTest.java new file mode 100644 index 00000000000..c6a7fa9f403 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineUnitTest.java @@ -0,0 +1,554 @@ +package org.broadinstitute.hellbender.tools.sv.stratify; + +import com.google.common.collect.Lists; +import htsjdk.samtools.util.Locatable; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.engine.GATKPath; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.sv.SVCallRecord; +import org.broadinstitute.hellbender.tools.sv.SVTestUtils; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; +import java.util.stream.Collectors; + +public class SVStratificationEngineUnitTest extends GATKBaseTest { + + private static final GATKPath CONFIG_FILE_PATH = new GATKPath(toolsTestDir + "/sv/sv_stratify_config.tsv"); + + private static final String 
CONTEXT_1_NAME = "context1"; + private static final String CONTEXT_2_NAME = "context2"; + + private static final List CONTEXT_1_INTERVALS = Lists.newArrayList(new SimpleInterval("chr1", 1000, 2000)); + private static final List CONTEXT_2_INTERVALS = Lists.newArrayList(new SimpleInterval("chr2", 1000, 2000)); + + private static SVStatificationEngine makeDefaultEngine() { + return new SVStatificationEngine(SVTestUtils.hg38Dict); + } + + @Test + public void testAddContext() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + Assert.assertNotNull(engine.getTrackIntervals(CONTEXT_1_NAME)); + Assert.assertNull(engine.getTrackIntervals(CONTEXT_2_NAME)); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testAddDuplicateContext() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_2_INTERVALS); + } + + @Test + public void testNoContexts() { + final SVStatificationEngine engine = makeDefaultEngine(); + Assert.assertTrue(engine.getStrata().isEmpty()); + } + + @Test + public void testAddStratification() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton(CONTEXT_1_NAME)); + final Collection stratificationCollection = engine.getStrata(); + Assert.assertNotNull(stratificationCollection); + Assert.assertEquals(stratificationCollection.size(), 1); + final SVStatificationEngine.Stratum stratification = stratificationCollection.iterator().next(); + Assert.assertNotNull(stratification); + Assert.assertEquals(stratification.getSvType(), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL); + Assert.assertNotNull(stratification.getMinSize()); + 
Assert.assertEquals(stratification.getMinSize().intValue(), 50); + Assert.assertNotNull(stratification.getMaxSize()); + Assert.assertEquals(stratification.getMaxSize().intValue(), 500); + Assert.assertEquals(stratification.getTrackNames().size(), 1); + Assert.assertEquals(stratification.getTrackNames().get(0), CONTEXT_1_NAME); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testAddStratificationBadMinSize() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, -1, 500, Collections.emptySet()); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testAddStratificationBadMaxSize() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, -1, Collections.emptySet()); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testAddStratificationBadMaxSizeInfinity() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Integer.MAX_VALUE, Collections.emptySet()); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testAddStratificationMaxEqualToMin() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 50, Collections.emptySet()); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testAddStratificationMaxLessThanMin() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 49, Collections.emptySet()); + } + + @Test + public void testCreate() { + final Map> map = new HashMap<>(); + map.put(CONTEXT_1_NAME, 
CONTEXT_1_INTERVALS); + map.put(CONTEXT_2_NAME, CONTEXT_2_INTERVALS); + final SVStatificationEngine engine = SVStatificationEngine.create(map, CONFIG_FILE_PATH, SVTestUtils.hg38Dict); + Assert.assertNotNull(engine); + Assert.assertNotNull(engine.getTrackIntervals(CONTEXT_1_NAME)); + Assert.assertEquals(engine.getStrata().size(), 7); + } + + @DataProvider(name="testGetMatchVariantsData") + public Object[][] testGetMatchVariantsData() { + return new Object[][] { + + // DEL + + // Outside context interval + { "chr1", 100, "chr1", 200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null }, + { "chr1", 2000, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null }, + // Simple match + { "chr1", 1100, "chr1", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + { "chr1", 900, "chr1", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + { "chr1", 900, "chr1", 1900, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + { "chr1", 1100, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + { "chr1", 800, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + { "chr1", 999, "chr1", 2001, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + { "chr2", 1100, "chr2", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + // Wrong contig + { "chr3", 1100, "chr3", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null }, + // Barely match + { "chr1", 1000, "chr1", 3001, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + { "chr1", 2, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + { "chr1", 500, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + 
// Barely miss overlap threshold + { "chr1", 1000, "chr1", 3002, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null }, + // Barely large enough + { "chr1", 1100, "chr1", 1149, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" }, + // Too small + { "chr1", 1100, "chr1", 1148, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null }, + + // INV (null context) + + // Right size + { "chr1", 1001, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, "INV_gt1kb" }, + { "chr1", 4001, "chr1", 5000, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, "INV_gt1kb" }, + { "chr2", 10000, "chr2", 20000, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, "INV_gt1kb" }, + // Too small + { "chr1", 1001, "chr1", 1999, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, null }, + { "chr1", 100, "chr1", 200, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, null }, + + // INS + + // In context + { "chr1", 1100, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 100, "INS_context1" }, + // SVLEN should not matter + { "chr1", 1100, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 1, "INS_context1" }, + { "chr1", 1100, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 10000, "INS_context1" }, + // Out of context + { "chr1", 100, "chr1", 100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 100, null }, + // Out of size range for context2 + { "chr2", 1100, "chr2", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 1000, null }, + { "chr2", 1100, "chr2", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 400, null }, + + // BND + + // Both ends + { "chr1", 1000, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, "BND_context1" }, + { "chr1", 2000, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, "BND_context1" }, + // One end only 
+ { "chr1", 500, "chr1", 900, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, null }, + { "chr1", 1500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, null }, + // No ends + { "chr1", 500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, null }, + + // CTX (same as BND) + + // Both ends + { "chr1", 1000, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, "CTX_context1" }, + { "chr1", 2000, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, "CTX_context1" }, + // One end only + { "chr1", 500, "chr1", 900, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, null }, + { "chr1", 1500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, null }, + // No ends + { "chr1", 500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, null }, + }; + } + + @Test(dataProvider = "testGetMatchVariantsData") + public void testGetMatchVariants(final String chromA, final int posA, final String chromB, final int posB, + final GATKSVVCFConstants.StructuralVariantAnnotationType svType, + final Integer svlen, + final String expectedStratName) { + final Map> map = new HashMap<>(); + map.put(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + map.put(CONTEXT_2_NAME, CONTEXT_2_INTERVALS); + final SVStatificationEngine engine = SVStatificationEngine.create(map, CONFIG_FILE_PATH, SVTestUtils.hg38Dict); + final SVCallRecord record; + if (svType == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) { + record = SVTestUtils.newCallRecordInsertionWithLengthAndCoordinates(chromA, posA, svlen); + } else { + record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", chromA, posA, chromB, posB, svType); + } + final Collection result = engine.getMatches(record, 0.5, 0, 2); + if (expectedStratName == null) { + Assert.assertTrue(result.isEmpty()); + } else { + Assert.assertFalse(result.isEmpty()); + 
Assert.assertEquals(result.iterator().next().getName(), expectedStratName); + } + } + + // Not supported + @Test(expectedExceptions = GATKException.class) + public void testGetMatchVariantsCpx() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + engine.addTrack("context3", Lists.newArrayList(new SimpleInterval("chr1", 1500, 2500))); + engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, 50, 500, Collections.singleton("context1")); + engine.addStratification("strat2", GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, 50, 500, Collections.singleton("context3")); + final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr1", 1800, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX); + // Should throw error + engine.getMatches(record, 0.5, 0, 2); + } + + @Test + public void testGetMatchVariantsMultiple() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + engine.addTrack("context3", Lists.newArrayList(new SimpleInterval("chr1", 1500, 2500))); + engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton("context1")); + engine.addStratification("strat2", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton("context3")); + final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr1", 1800, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL); + final Collection result = engine.getMatches(record, 0.5, 0, 2); + final List names = result.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList()); + Assert.assertTrue(names.contains("strat1")); + Assert.assertTrue(names.contains("strat2")); + } + + @Test + public void testGetMatchVariantsNullContexts() { + final SVStatificationEngine 
engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.emptySet()); + final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr2", 1800, "chr2", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL); + final Collection result = engine.getMatches(record, 0.5, 0, 2); + final List names = result.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList()); + Assert.assertEquals(names.size(), 1); + Assert.assertEquals(names.get(0), "strat1"); + } + + @Test + public void testGetMatchVariantsNoEngineContexts() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.emptySet()); + final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr2", 1800, "chr2", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL); + final Collection result = engine.getMatches(record, 0.5, 0, 2); + final List names = result.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList()); + Assert.assertEquals(names.size(), 1); + Assert.assertEquals(names.get(0), "strat1"); + } + + @Test + public void testTestAddStratificationInnerClass() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + final SVStatificationEngine.Stratum stratification = engine.new Stratum("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton(CONTEXT_1_NAME)); + engine.addStratification(stratification); + final Collection stratificationCollection = engine.getStrata(); + Assert.assertNotNull(stratificationCollection); + Assert.assertEquals(stratificationCollection.size(), 1); + final SVStatificationEngine.Stratum stratificationOut = 
stratificationCollection.iterator().next(); + Assert.assertNotNull(stratificationOut); + Assert.assertEquals(stratificationOut.getSvType(), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL); + Assert.assertNotNull(stratificationOut.getMinSize()); + Assert.assertEquals(stratificationOut.getMinSize().intValue(), 50); + Assert.assertNotNull(stratificationOut.getMaxSize()); + Assert.assertEquals(stratificationOut.getMaxSize().intValue(), 500); + Assert.assertEquals(stratificationOut.getTrackNames().size(), 1); + Assert.assertEquals(stratificationOut.getTrackNames().get(0), CONTEXT_1_NAME); + } + + @Test + public void testMatchesType() { + final SVStatificationEngine engine = makeDefaultEngine(); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + 100, 500, + Collections.emptySet() + ); + Assert.assertTrue(strat.matchesType(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + Assert.assertFalse(strat.matchesType(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP))); + } + + @Test + public void testMatchesSizeSimple() { + final SVStatificationEngine engine = makeDefaultEngine(); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + 100, 500, + Collections.emptySet() + ); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(499, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(50, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(500, 
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + } + + @Test + public void testMatchesSizeNoMin() { + final SVStatificationEngine engine = makeDefaultEngine(); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + null, 500, + Collections.emptySet() + ); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(499, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(500, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + } + + @Test + public void testMatchesSizeNoMax() { + final SVStatificationEngine engine = makeDefaultEngine(); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + 50, null, + Collections.emptySet() + ); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(49, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(Integer.MAX_VALUE - 1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + } + + @Test + public void testMatchesSizeNoMinOrMax() { + final SVStatificationEngine engine = makeDefaultEngine(); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + null, null, + Collections.emptySet() + ); + 
Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(Integer.MAX_VALUE - 1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL))); + } + + @Test + public void testMatchesSizeInsertion() { + final SVStatificationEngine engine = makeDefaultEngine(); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.INS, + 100, 500, + Collections.emptySet() + ); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS))); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(499, GATKSVVCFConstants.StructuralVariantAnnotationType.INS))); + Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(50, GATKSVVCFConstants.StructuralVariantAnnotationType.INS))); + Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(500, GATKSVVCFConstants.StructuralVariantAnnotationType.INS))); + } + + @Test + public void testMatchesSizeInsertionNullLength() { + final SVStatificationEngine engine = makeDefaultEngine(); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.INS, + 0, Integer.MAX_VALUE - 1, + Collections.emptySet() + ); + Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.INS))); + } + + @Test + public void testMatchesSizeInsertionNullLength2() { + final SVStatificationEngine engine = makeDefaultEngine(); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.INS, + null, null, + Collections.emptySet() + ); + 
Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.INS))); + } + + @Test + public void testMatchesSizeBnd() { + final SVStatificationEngine engine = makeDefaultEngine(); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.BND, + null, null, + Collections.emptySet() + ); + Assert.assertTrue(strat.matchesSize(SVTestUtils.newBndCallRecordWithStrands(true, false))); + } + + + @DataProvider(name="testMatchesContextDelData") + public Object[][] testMatchesContextDelData() { + return new Object[][] { + // chrom, start, end, overlapFraction, numBreakpointOverlaps, expected + { "chr1", 1000, 1500, 0.5, 0, true }, + { "chr1", 500, 1500, 0.5, 0, true }, + { "chr1", 499, 1499, 0.5, 0, false }, + { "chr1", 900, 1300, 0.5, 1, true }, + { "chr1", 1999, 2000000, 0, 1, true }, + { "chr1", 500, 600, 0, 2, false }, + { "chr1", 500, 1100, 0, 2, false }, + { "chr1", 1100, 1200, 0, 2, true }, + { "chr1", 1100, 1200, 1, 2, true } + }; + } + + @Test(dataProvider = "testMatchesContextDelData") + public void testMatchesContextDel(final String chrom, final int start, final int end, + final double overlapFraction, final int numBreakpointOverlaps, + final boolean expected) { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.BND, + null, null, + Collections.singleton(CONTEXT_1_NAME) + ); + Assert.assertEquals(strat.matchesTracks(SVTestUtils.newCallRecordWithCoordinatesAndType("record", chrom, start, chrom, end, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL), + overlapFraction, numBreakpointOverlaps, 1), expected); + } + + @DataProvider(name="testMatchesContextInsData") + public Object[][] testMatchesContextInsData() { + return new Object[][] { + // chrom, start, length, overlapFraction, numBreakpointOverlaps, expected + 
{ "chr1", 1100, 100, 0.1, 0, true }, + { "chr1", 1100, 100000, 0.1, 0, true }, + { "chr1", 999, 100, 0.1, 0, false } + }; + } + + @Test(dataProvider = "testMatchesContextInsData") + public void testMatchesContextIns(final String chrom, final int start, final int length, + final double overlapFraction, final int numBreakpointOverlaps, + final boolean expected) { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.BND, + null, null, + Collections.singleton(CONTEXT_1_NAME) + ); + Assert.assertEquals(strat.matchesTracks(SVTestUtils.newCallRecordInsertionWithLengthAndCoordinates(chrom, start, length), + overlapFraction, numBreakpointOverlaps, 1), expected); + } + + @DataProvider(name="testMatchesContextBndData") + public Object[][] testMatchesContextBndData() { + return new Object[][] { + { "chr1", 999, "chr1", 2001, 1, false }, + { "chr1", 1000, "chr1", 1200, 1, true }, + { "chr1", 1000, "chr1", 50000, 1, true }, + { "chr1", 1000, "chr1", 1000, 1, true }, + { "chr1", 500, "chr1", 1000, 1, true }, + { "chr1", 1000, "chr1", 1999, 2, true }, + { "chr1", 1000, "chr1", 2000, 2, true }, + { "chr1", 1000, "chr2", 1000, 2, false }, + { "chr1", 1000, "chr1", 2001, 2, false }, + { "chr1", 999, "chr1", 1000, 2, false } + }; + } + + @Test(dataProvider = "testMatchesContextBndData") + public void testMatchesContextBnd(final String chromA, final int posA, final String chromB, final int posB, + final int numBreakpointOverlapsInterchrom, final boolean expected) { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.BND, + null, null, + Collections.singleton(CONTEXT_1_NAME) + ); + 
Assert.assertEquals(strat.matchesTracks(SVTestUtils.newCallRecordWithCoordinatesAndType("record", chromA, posA, chromB, posB, GATKSVVCFConstants.StructuralVariantAnnotationType.BND), + 0.5, 2, numBreakpointOverlapsInterchrom), expected); + } + + @DataProvider(name="testCountAnyContextOverlapData") + public Object[][] testCountAnyContextOverlapData() { + return new Object[][] { + { "chr1", 500, 1500, 1 }, + { "chr1", 1000, 2000, 1 }, + { "chr1", 1500, 2500, 1 }, + { "chr1", 500, 2500, 1 }, + { "chr1", 1100, 1900, 1 }, + { "chr1", 999, 999, 0 }, + { "chr1", 999, 1000, 1 }, + { "chr1", 1000, 1000, 1 }, + { "chr1", 1000, 1001, 1 }, + { "chr2", 1000, 1001, 0 }, + { "chr1", 1999, 2000, 1 }, + { "chr1", 2000, 2000, 1 }, + { "chr1", 2001, 2001, 0 } + }; + } + + @Test(dataProvider = "testCountAnyContextOverlapData") + public void testCountAnyContextOverlap(final String chrom, final int start, final int end, final int expected) { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.BND, + null, null, + Collections.singleton(CONTEXT_1_NAME) + ); + Assert.assertEquals(strat.countAnyTrackOverlap(new SimpleInterval(chrom, start, end)), expected); + } + + @DataProvider(name="testIsMutuallyExclusiveData") + public Object[][] testIsMutuallyExclusiveData() { + return new Object[][] { + {GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null, null, null, + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null, null, null, + false}, + }; + } + + @Test + public void testGetters() { + final SVStatificationEngine engine = makeDefaultEngine(); + engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS); + final SVStatificationEngine.Stratum strat = engine.new Stratum( + "strat", + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + 50, 500, + 
Collections.singleton(CONTEXT_1_NAME) + ); + Assert.assertEquals(strat.getTrackNames().size(), 1); + Assert.assertEquals(strat.getTrackNames().get(0), CONTEXT_1_NAME); + Assert.assertEquals(strat.getSvType(), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL); + Assert.assertEquals(strat.getMinSize(), Integer.valueOf(50)); + Assert.assertEquals(strat.getMaxSize(), Integer.valueOf(500)); + Assert.assertEquals(strat.getName(), "strat"); + } +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVClusterIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVClusterIntegrationTest.java new file mode 100644 index 00000000000..734423bdd65 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVClusterIntegrationTest.java @@ -0,0 +1,112 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.StructuralVariantType; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFHeader; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.List; + +public class GroupedSVClusterIntegrationTest extends CommandLineProgramTest { + + @Test + public void testClusterStratified() { + final File output = createTempFile("single_linkage_cluster", 
".vcf"); + + final String clusteringConfigFile = getToolTestDataDir() + "stratified_cluster_params.tsv"; + final String stratifyConfigFile = getToolTestDataDir() + "stratified_cluster_strata.tsv"; + final String segdupFile = getToolTestDataDir() + "../SVStratify/hg38.SegDup.chr22.bed"; + final String segdupName = "SD"; + final String repeatmaskerFile = getToolTestDataDir() + "../SVStratify/hg38.RM.chr22_subsampled.bed"; + final String repeatmaskerName = "RM"; + + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(output) + .addVCF(getToolTestDataDir() + "../SVStratify/bwa_melt.chr22.vcf.gz") + .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "../SVCluster/1kgp.batch1.ploidy.tsv") + .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx") + .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE) + .add(GroupedSVCluster.CLUSTERING_CONFIG_FILE_LONG_NAME, clusteringConfigFile) + .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, stratifyConfigFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, segdupName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, segdupFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, repeatmaskerName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, repeatmaskerFile) + .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5) + .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, GATKBaseTest.hg38Reference); + + runCommandLine(args, GroupedSVCluster.class.getSimpleName()); + + final Pair> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); + final List records = vcf.getValue(); + + Assert.assertEquals(records.size(), 1437); + + // Check for specific records + int expectedRecordsFound = 0; + for (final VariantContext variant : records) { + 
Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)); + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY)); + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE)); + if (variant.getID().equals("SVx00000032")) { + expectedRecordsFound++; + Assert.assertEquals(variant.getContig(), "chr22"); + Assert.assertEquals(variant.getStart(), 11628747); + Assert.assertEquals(variant.getEnd(), 11629803); + final List algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null); + Assert.assertEquals(algorithms.size(), 2); + Assert.assertTrue(algorithms.contains("manta")); + Assert.assertTrue(algorithms.contains("wham")); + final List members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null); + Assert.assertEquals(members.size(), 2); + final List alts = variant.getAlternateAlleles(); + Assert.assertEquals(alts.size(), 1); + Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DEL); + Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DEL); + Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), "DEL_50_5k_SD_RM"); + } else if (variant.getID().equals("SVx00000125")) { + expectedRecordsFound++; + Assert.assertEquals(variant.getContig(), "chr22"); + Assert.assertEquals(variant.getStart(), 22563654); + Assert.assertEquals(variant.getEnd(), 22567049); + final List algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null); + Assert.assertEquals(algorithms.size(), 1); + Assert.assertTrue(algorithms.contains("manta")); + final List members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null); + Assert.assertEquals(members.size(), 1); + final List alts = variant.getAlternateAlleles(); + Assert.assertEquals(alts.size(), 1); + Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DEL); + 
Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DEL); + Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), SVStratify.DEFAULT_STRATUM); + } else if (variant.getID().equals("SVx000001dc")) { + expectedRecordsFound++; + Assert.assertEquals(variant.getContig(), "chr22"); + Assert.assertEquals(variant.getStart(), 26060912); + Assert.assertEquals(variant.getEnd(), 26060989); + final List algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null); + Assert.assertEquals(algorithms.size(), 1); + Assert.assertTrue(algorithms.contains("manta")); + final List members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null); + Assert.assertEquals(members.size(), 1); + final List alts = variant.getAlternateAlleles(); + Assert.assertEquals(alts.size(), 1); + Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DUP); + Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DUP); + Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), SVStratify.DEFAULT_STRATUM); + } + } + Assert.assertEquals(expectedRecordsFound, 3); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java index 8f27d2a6389..136e0c36ff0 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java @@ -234,7 +234,6 @@ public void testOverlappingEvents(final List inputVcfs) { //in NA11829 variant events are not overlapping, so there should be a CN2 homRef in between final List samplesWithOverlaps = Arrays.asList("HG00365", "HG01789", "HG02221", "NA07357", "NA12005", "NA12873", "NA18997", "NA19428", 
"NA21120"); - final List samplesWithGaps = Arrays.asList("NA11829"); //all of these samples have an event that overlaps the next event, which is not called in that sample boolean sawVariant; @@ -254,18 +253,18 @@ public void testOverlappingEvents(final List inputVcfs) { } //these samples have a variant that doesn't overlap the next call - for (final String sample : samplesWithGaps) { - sawVariant = false; - for (final VariantContext vc : overlappingEvents.getRight()) { - if (!sawVariant && !vc.getGenotype(sample).isHomRef()) { - sawVariant = true; - continue; - } - if (sawVariant) { - Assert.assertTrue(vc.getGenotype(sample).isHomRef() - && (Integer.parseInt(vc.getGenotype(sample).getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT).toString()) == 2)); - break; - } + sawVariant = false; + for (final VariantContext vc : overlappingEvents.getRight()) { + final Genotype genotype = vc.getGenotype("NA11829"); + if (!sawVariant && !genotype.isHomRef()) { + sawVariant = true; + continue; + } + if (sawVariant && vc.getEnd() == 23236095) { + // Smaller variant nested inside larger hom-var DEL: hom-ref genotype but CN is 0 since it overlaps + Assert.assertTrue(genotype.isHomRef() + && (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT).toString()) == 0)); + break; } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java index caff0aa9675..ec3e6e15f96 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java @@ -23,6 +23,7 @@ import org.testng.annotations.Test; import java.io.File; +import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.List; @@ -43,7 +44,7 @@ public void testDefragmentation() { 
.addVCF(inputVcfPath) .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv") .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx") - .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.DEFRAGMENT_CNV) + .add(SVCluster.ALGORITHM_LONG_NAME, SVClusterWalker.CLUSTER_ALGORITHM.DEFRAGMENT_CNV) .add(SVCluster.DEFRAG_PADDING_FRACTION_LONG_NAME, 0.25) .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0.5); @@ -293,17 +294,23 @@ public void testAgainstSimpleImplementation() { mixedParameters, pesrParameters); - vcfInputFilenames.stream() - .flatMap(vcfFilename -> VariantContextTestUtils.readEntireVCFIntoMemory(getToolTestDataDir() + vcfFilename).getValue().stream()) - .sorted(IntervalUtils.getDictionaryOrderComparator(referenceSequenceFile.getSequenceDictionary())) - .map(v -> SVCallRecordUtils.create(v, SVTestUtils.hg38Dict)) - .forEach(engine::add); + final List expectedRecords = new ArrayList<>(); + expectedRecords.addAll( + vcfInputFilenames.stream() + .flatMap(vcfFilename -> VariantContextTestUtils.readEntireVCFIntoMemory(getToolTestDataDir() + vcfFilename).getValue().stream()) + .sorted(IntervalUtils.getDictionaryOrderComparator(referenceSequenceFile.getSequenceDictionary())) + .map(v -> SVCallRecordUtils.create(v, SVTestUtils.hg38Dict)) + .map(engine::addAndFlush) + .flatMap(List::stream) + .collect(Collectors.toList()) + ); + expectedRecords.addAll(engine.flush()); - final Comparator recordComparator = SVCallRecordUtils.getCallComparator(referenceSequenceFile.getSequenceDictionary()); - final List expectedVariants = engine.forceFlush().stream() - .sorted(recordComparator) + final Comparator recordComparator = testVcf.getLeft().getVCFRecordComparator(); + final List expectedVariants = expectedRecords.stream() .map(SVCallRecordUtils::getVariantBuilder) .map(VariantContextBuilder::make) + .sorted(recordComparator) .collect(Collectors.toList()); final List testVariants = testVcf.getValue(); @@ -533,5 
+540,34 @@ public void testAllosome() { } Assert.assertEquals(expectedRecordsFound, 1); } + @Test + public void testCleanedVcf() { + final File output = createTempFile("cleaned_vcf_cluster", ".vcf"); + // Note we use very loose clustering criteria on a normal cleaned vcf to ensure some clustering happens + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(output) + .addVCF(getToolTestDataDir() + "bwa_melt.cleaned.chr22_chrY.vcf.gz") + .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv") + .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx") + .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE) + .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, REFERENCE_PATH) + .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.DEPTH_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.DEPTH_BREAKEND_WINDOW_NAME, 10000000) + .add(SVClusterEngineArgumentsCollection.MIXED_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.MIXED_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.MIXED_BREAKEND_WINDOW_NAME, 5000) + .add(SVClusterEngineArgumentsCollection.PESR_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.PESR_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.PESR_BREAKEND_WINDOW_NAME, 5000); + + runCommandLine(args, SVCluster.class.getSimpleName()); + + final Pair> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); + final VCFHeader header = vcf.getKey(); + Assert.assertEquals(header.getSampleNamesInOrder().size(), 161); + final List records = vcf.getValue(); + Assert.assertEquals(records.size(), 1227); + } } \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java 
b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java new file mode 100644 index 00000000000..c3badd60497 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java @@ -0,0 +1,208 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import com.google.common.collect.Lists; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFHeader; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; +import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection; +import org.testng.Assert; +import org.testng.annotations.Test; +import picard.vcf.VcfUtils; + +import java.io.File; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class SVStratifyIntegrationTest extends CommandLineProgramTest { + + @Test + public void testBwaMeltCohort() { + final File outputDir = createTempDir("stratify"); + final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz"; + final String configFile = getToolTestDataDir() + "test_config.tsv"; + + final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed"; + final String segdupName = "SD"; + final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed"; + final String repeatmaskerName = "RM"; + + final ArgumentsBuilder args = 
new ArgumentsBuilder() + .addOutput(outputDir) + .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test") + .add(SVStratify.SPLIT_OUTPUT_LONG_NAME, true) + .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, segdupName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, segdupFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, repeatmaskerName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, repeatmaskerFile) + .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5) + .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT) + .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath); + + runCommandLine(args, SVStratify.class.getSimpleName()); + + final File[] outputFiles = outputDir.listFiles(); + Assert.assertEquals(outputFiles.length, 14); + final Map expectedOutputSuffixes = new HashMap<>(); + expectedOutputSuffixes.put("INS_small_SD", 46); + expectedOutputSuffixes.put("DEL_50_5k_both", 110); + expectedOutputSuffixes.put("DEL_5k_50k_SD", 2); + expectedOutputSuffixes.put("DUP_lt5kb_RM", 0); + expectedOutputSuffixes.put("INV_gt1kb", 26); + expectedOutputSuffixes.put("BND_SD", 77); + expectedOutputSuffixes.put(SVStratify.DEFAULT_STRATUM, 1196); + int numVcfs = 0; + int totalRecords = 0; + for (final File file : outputFiles) { + if (VcfUtils.isVariantFile(file)) { + ++numVcfs; + final Pair> outputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(file.getAbsolutePath()); + boolean foundSuffix = false; + for (final String suffix : expectedOutputSuffixes.keySet()) { + if (file.toString().contains("." 
+ suffix + ".")) { + foundSuffix = true; + for (final VariantContext variant : outputVcf.getRight()) { + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY)); + Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), suffix); + } + final int expectedSize = expectedOutputSuffixes.get(suffix).intValue(); + final int actualSize = outputVcf.getRight().size(); + Assert.assertEquals(actualSize, expectedSize, + "Expected " + expectedSize + " records but found " + actualSize + " in " + suffix); + totalRecords += actualSize; + break; + } + } + Assert.assertTrue(foundSuffix, "Unexpected file suffix: " + file.getAbsolutePath()); + } + } + Assert.assertEquals(numVcfs, 7); + final int numInputRecords = VariantContextTestUtils.readEntireVCFIntoMemory(inputVcfPath).getRight().size(); + Assert.assertEquals(totalRecords, numInputRecords); + } + + @Test + public void testBwaMeltCohortSingleOutput() { + final File outputDir = createTempDir("stratify"); + final File outputFile = outputDir.toPath().resolve("out.vcf.gz").toFile(); + final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz"; + final String configFile = getToolTestDataDir() + "test_config.tsv"; + + final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed"; + final String segdupName = "SD"; + final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed"; + final String repeatmaskerName = "RM"; + + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(outputFile) + .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, segdupName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, segdupFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, repeatmaskerName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, 
repeatmaskerFile) + .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5) + .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT) + .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath); + + runCommandLine(args, SVStratify.class.getSimpleName()); + + final List outputFiles = Lists.newArrayList(outputDir.listFiles()).stream().filter(VcfUtils::isVariantFile).collect(Collectors.toUnmodifiableList()); + Assert.assertEquals(outputFiles.size(), 1); + Assert.assertEquals(outputFiles.get(0).getAbsolutePath(), outputFile.getAbsolutePath()); + final Pair> inputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(inputVcfPath); + final Pair> outputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(outputFile.getAbsolutePath()); + Assert.assertEquals(outputVcf.getRight().size(), inputVcf.getRight().size()); + } + + @Test(expectedExceptions = GATKException.class) + public void testBwaMeltCohortRedundant() { + final File outputDir = createTempDir("stratify"); + final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz"; + final String configFile = getToolTestDataDir() + "test_config_redundant.tsv"; + + final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed"; + final String segdupName = "SD"; + final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed"; + final String repeatmaskerName = "RM"; + + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(outputDir) + .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test") + .add(SVStratify.SPLIT_OUTPUT_LONG_NAME, true) + .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, segdupName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, segdupFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, repeatmaskerName) + 
.add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, repeatmaskerFile) + .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5) + .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT) + .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath); + + runCommandLine(args, SVStratify.class.getSimpleName()); + } + + @Test + public void testBwaMeltCohortBypassRedundant() { + final File outputDir = createTempDir("stratify"); + final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz"; + final String configFile = getToolTestDataDir() + "test_config_redundant.tsv"; + + final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed"; + final String segdupName = "SD"; + final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed"; + final String repeatmaskerName = "RM"; + + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(outputDir) + .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test") + .add(SVStratify.SPLIT_OUTPUT_LONG_NAME, true) + .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, segdupName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, segdupFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, repeatmaskerName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, repeatmaskerFile) + .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5) + .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT) + .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath) + .addFlag(SVStratify.ALLOW_MULTIPLE_MATCHES_LONG_NAME); + + runCommandLine(args, SVStratify.class.getSimpleName()); + } + + @Test(expectedExceptions = {GATKException.class}) + public 
void testBwaMeltCohortDuplicateContextName() { + final File outputDir = createTempDir("stratify"); + final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz"; + final String configFile = getToolTestDataDir() + "test_config_duplicate.tsv"; + + final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed"; + final String segdupName = "SD"; + final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed"; + final String repeatmaskerName = "RM"; + + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(outputDir) + .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test") + .add(SVStratify.SPLIT_OUTPUT_LONG_NAME, true) + .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, segdupName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, segdupFile) + .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, repeatmaskerName) + .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, repeatmaskerFile) + .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5) + .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT) + .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath); + + runCommandLine(args, SVStratify.class.getSimpleName()); + } +} \ No newline at end of file diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/sv/sv_stratify_config.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/sv/sv_stratify_config.tsv new file mode 100644 index 00000000000..b75575e45ae --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/sv/sv_stratify_config.tsv @@ -0,0 +1,8 @@ +NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS +INS_context1 INS -1 -1 context1 +DEL_50_5k_both DEL 50 5000 context1,context2 +DEL_5k_50k_context1 DEL 5000 50000 
context1 +DUP_lt5kb_context1 DUP -1 5000 context1 +INV_gt1kb INV 1000 -1 NULL +BND_context1 BND -1 -1 context1 +CTX_context1 CTX -1 -1 context1 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_params.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_params.tsv new file mode 100644 index 00000000000..b94357460a6 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_params.tsv @@ -0,0 +1,3 @@ +NAME RECIPROCAL_OVERLAP SIZE_SIMILARITY BREAKEND_WINDOW SAMPLE_OVERLAP +INS_small_SD 0.1 0.5 50 0 +DEL_50_5k_SD_RM 0.1 0.5 1000 0 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_strata.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_strata.tsv new file mode 100644 index 00000000000..9cb836c9d33 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_strata.tsv @@ -0,0 +1,3 @@ +NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS +INS_small_SD INS -1 -1 SD +DEL_50_5k_SD_RM DEL 50 5000 SD,RM diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz new file mode 100644 index 00000000000..ebf9af067ff Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz.tbi new file mode 100644 index 00000000000..c394f2c6989 Binary 
files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz index 0fc84b559b1..8632e8cbcd2 100644 Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz.tbi index 5827787165f..191d92f3007 100644 Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz.tbi and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz new file mode 100644 index 00000000000..9a85bb047b6 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz.tbi new file mode 100644 index 00000000000..86dc452a4ad 
Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.RM.chr22_subsampled.bed b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.RM.chr22_subsampled.bed new file mode 100644 index 00000000000..999a8da1b4c --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.RM.chr22_subsampled.bed @@ -0,0 +1,1000 @@ +chr22 10548844 10549072 +chr22 10600155 10600379 +chr22 10609936 10610026 +chr22 10680159 10684708 +chr22 10731111 10731244 +chr22 10782698 10783246 +chr22 11032485 11032795 +chr22 11053141 11053245 +chr22 11056252 11056541 +chr22 11264429 11264489 +chr22 11354245 11354759 +chr22 11480660 11480856 +chr22 11485802 11485889 +chr22 11814558 11816405 +chr22 11877778 11878081 +chr22 11879854 11880179 +chr22 11929231 11929483 +chr22 12066840 12067240 +chr22 12101013 12101573 +chr22 12277861 12278206 +chr22 12310868 12310957 +chr22 12323627 12323767 +chr22 12558845 12559121 +chr22 12574259 12574469 +chr22 12582498 12582554 +chr22 12871618 12872075 +chr22 12880177 12880513 +chr22 12883467 12883802 +chr22 12888865 12889065 +chr22 15223165 15223252 +chr22 15309236 15309325 +chr22 15323774 15323965 +chr22 15485028 15485135 +chr22 15499798 15499999 +chr22 15590783 15591195 +chr22 15594920 15594975 +chr22 15717078 15717160 +chr22 15756004 15756039 +chr22 15771644 15771788 +chr22 15803438 15805331 +chr22 15831943 15832771 +chr22 15856856 15856897 +chr22 16073496 16085556 +chr22 16235053 16241190 +chr22 16341035 16341082 +chr22 16436337 16436641 +chr22 16522669 16522956 +chr22 16569061 16570644 +chr22 16579154 16579393 +chr22 16588039 16588645 +chr22 16624386 16624679 +chr22 16649026 16649063 +chr22 16716314 16716812 +chr22 16733932 16734095 +chr22 16757884 16759904 +chr22 16765013 16766359 +chr22 16773365 16773644 +chr22 
16812343 16812932 +chr22 16922577 16923317 +chr22 17053575 17055162 +chr22 17099401 17099577 +chr22 17123718 17123756 +chr22 17123860 17124390 +chr22 17162538 17162584 +chr22 17192233 17192324 +chr22 17219665 17219975 +chr22 17231307 17231631 +chr22 17289154 17289255 +chr22 17307511 17307713 +chr22 17345763 17346002 +chr22 17424135 17424269 +chr22 17502119 17502298 +chr22 17586746 17586774 +chr22 17622685 17622996 +chr22 17631522 17631618 +chr22 17635470 17635585 +chr22 17719382 17720182 +chr22 17740160 17740924 +chr22 17761783 17761859 +chr22 17834434 17834561 +chr22 17889544 17889827 +chr22 17937381 17937690 +chr22 17969044 17969338 +chr22 18006551 18006852 +chr22 18028500 18028590 +chr22 18029082 18029800 +chr22 18175694 18176298 +chr22 18219880 18219923 +chr22 18236756 18237477 +chr22 18350047 18350201 +chr22 18393343 18394705 +chr22 18412200 18412296 +chr22 18519917 18520001 +chr22 18525505 18526658 +chr22 18544911 18545084 +chr22 18602476 18602976 +chr22 18618302 18618432 +chr22 18782455 18782585 +chr22 18789650 18789809 +chr22 18791336 18791659 +chr22 18930115 18930281 +chr22 19020829 19021146 +chr22 19074769 19075115 +chr22 19079365 19079539 +chr22 19122416 19122479 +chr22 19239936 19240262 +chr22 19264258 19264515 +chr22 19285483 19285618 +chr22 19412935 19413146 +chr22 19423144 19423261 +chr22 19425679 19426053 +chr22 19529955 19532958 +chr22 19614049 19614150 +chr22 19616914 19617330 +chr22 19703132 19703437 +chr22 19868564 19868856 +chr22 19881943 19882154 +chr22 19882823 19882847 +chr22 19917357 19917461 +chr22 19958599 19958908 +chr22 19997920 19997950 +chr22 20047588 20048524 +chr22 20090527 20090757 +chr22 20182900 20182943 +chr22 20186025 20186060 +chr22 20220167 20220250 +chr22 20385600 20386176 +chr22 20442982 20443216 +chr22 20671842 20672157 +chr22 20688300 20688577 +chr22 20698245 20698539 +chr22 20716963 20717151 +chr22 20770579 20770632 +chr22 20797635 20797751 +chr22 20902600 20902905 +chr22 20941588 20941824 +chr22 21061659 21062388 +chr22 
21132602 21132648 +chr22 21138170 21138284 +chr22 21164124 21165687 +chr22 21245971 21246024 +chr22 21316132 21316331 +chr22 21361921 21362255 +chr22 21400616 21400737 +chr22 21434587 21434702 +chr22 21461288 21461573 +chr22 21463519 21463670 +chr22 21466663 21466779 +chr22 21479125 21479456 +chr22 21553026 21553349 +chr22 21556425 21556761 +chr22 21589142 21589445 +chr22 21639819 21640102 +chr22 21701433 21701463 +chr22 21718903 21719107 +chr22 21775784 21776083 +chr22 21831369 21831481 +chr22 21836073 21836376 +chr22 21848700 21848889 +chr22 21872807 21873246 +chr22 21891165 21891946 +chr22 21905524 21906250 +chr22 22050964 22057732 +chr22 22082044 22083267 +chr22 22121138 22121179 +chr22 22129648 22129755 +chr22 22164599 22164628 +chr22 22218589 22219103 +chr22 22495319 22495644 +chr22 22502784 22503399 +chr22 22504151 22504451 +chr22 22542826 22543564 +chr22 22565742 22566184 +chr22 22577836 22577960 +chr22 22648947 22649064 +chr22 22699544 22699764 +chr22 22763058 22763414 +chr22 22773790 22773969 +chr22 22793388 22793691 +chr22 22821356 22821690 +chr22 22870885 22871146 +chr22 22874608 22875120 +chr22 22897694 22897982 +chr22 23007411 23007437 +chr22 23132162 23132268 +chr22 23155335 23155434 +chr22 23210141 23210438 +chr22 23211497 23211784 +chr22 23320710 23321000 +chr22 23340561 23340866 +chr22 23345739 23345841 +chr22 23354792 23354955 +chr22 23364979 23365063 +chr22 23485148 23485264 +chr22 23529915 23530118 +chr22 23604796 23605118 +chr22 23608743 23608792 +chr22 23623122 23623429 +chr22 23645005 23645150 +chr22 23647367 23648106 +chr22 23648786 23649066 +chr22 23653551 23653810 +chr22 23655810 23655918 +chr22 23656994 23657307 +chr22 23771428 23771724 +chr22 23782016 23782139 +chr22 23804540 23804840 +chr22 23821434 23822002 +chr22 23840019 23840959 +chr22 23844212 23844249 +chr22 23847086 23847387 +chr22 23873302 23873329 +chr22 23897728 23897772 +chr22 23916791 23916844 +chr22 23916853 23917151 +chr22 23935971 23936014 +chr22 23978581 23978867 +chr22 
23994211 23994639 +chr22 24047382 24047506 +chr22 24103759 24103790 +chr22 24137140 24137167 +chr22 24173428 24173547 +chr22 24203943 24204595 +chr22 24205135 24205305 +chr22 24210891 24212328 +chr22 24255042 24255345 +chr22 24380932 24381051 +chr22 24383294 24383415 +chr22 24424361 24424398 +chr22 24492032 24492098 +chr22 24551459 24552386 +chr22 24699562 24699706 +chr22 24728884 24729357 +chr22 24839371 24841737 +chr22 24970607 24970624 +chr22 24970876 24970937 +chr22 24976297 24976446 +chr22 25099930 25100029 +chr22 25109577 25109632 +chr22 25127039 25127224 +chr22 25130349 25130382 +chr22 25131042 25131144 +chr22 25153813 25153910 +chr22 25187512 25187886 +chr22 25204147 25204315 +chr22 25222687 25223012 +chr22 25224528 25224774 +chr22 25232497 25232672 +chr22 25236404 25236527 +chr22 25290873 25291544 +chr22 25325040 25325146 +chr22 25333316 25333623 +chr22 25365035 25365464 +chr22 25385771 25385864 +chr22 25387400 25387433 +chr22 25447307 25447615 +chr22 25462934 25463153 +chr22 25468278 25468437 +chr22 25471172 25471244 +chr22 25471678 25471718 +chr22 25489661 25489880 +chr22 25506902 25506943 +chr22 25514575 25514679 +chr22 25522465 25522536 +chr22 25523158 25523453 +chr22 25623794 25623900 +chr22 25684884 25685075 +chr22 25699118 25699192 +chr22 25778974 25779206 +chr22 25779269 25779406 +chr22 25792075 25792124 +chr22 25798550 25798864 +chr22 25801576 25801695 +chr22 25826777 25827082 +chr22 25828077 25828303 +chr22 25840236 25840633 +chr22 25865514 25865758 +chr22 25949759 25949805 +chr22 25955692 25955874 +chr22 26004081 26004166 +chr22 26012056 26012342 +chr22 26018670 26018968 +chr22 26035184 26035595 +chr22 26079579 26079749 +chr22 26105361 26105434 +chr22 26131043 26131182 +chr22 26133372 26133621 +chr22 26167276 26167791 +chr22 26182013 26182155 +chr22 26188864 26188929 +chr22 26189817 26190126 +chr22 26200324 26200423 +chr22 26253919 26254314 +chr22 26309875 26310021 +chr22 26310070 26310590 +chr22 26314441 26314603 +chr22 26348077 26348190 +chr22 
26405271 26405417 +chr22 26416724 26416760 +chr22 26425678 26426074 +chr22 26426415 26426809 +chr22 26434712 26434902 +chr22 26497474 26498687 +chr22 26559017 26559708 +chr22 26563156 26563297 +chr22 26653442 26654242 +chr22 26667161 26667265 +chr22 26676627 26676737 +chr22 26721609 26721830 +chr22 26743272 26743453 +chr22 26832554 26832948 +chr22 26872117 26874640 +chr22 26888939 26889127 +chr22 26889140 26889413 +chr22 26896135 26896351 +chr22 26908893 26909291 +chr22 26937973 26938087 +chr22 26959677 26959840 +chr22 26972578 26972768 +chr22 26976839 26977262 +chr22 27054525 27054697 +chr22 27118989 27119358 +chr22 27125182 27125301 +chr22 27149328 27149420 +chr22 27154121 27154413 +chr22 27155709 27156029 +chr22 27203271 27203343 +chr22 27208639 27208846 +chr22 27251103 27251392 +chr22 27254263 27254561 +chr22 27314767 27315129 +chr22 27332833 27332945 +chr22 27397534 27397696 +chr22 27479120 27479142 +chr22 27498898 27498945 +chr22 27544436 27544747 +chr22 27555680 27556723 +chr22 27595098 27595399 +chr22 27604216 27604320 +chr22 27615766 27615843 +chr22 27710176 27710269 +chr22 27721253 27721406 +chr22 27723543 27723731 +chr22 27726072 27726218 +chr22 27814110 27814976 +chr22 27839763 27840785 +chr22 27880022 27880212 +chr22 27883487 27883603 +chr22 28015118 28015209 +chr22 28024082 28024186 +chr22 28064438 28064633 +chr22 28176743 28178163 +chr22 28187992 28188124 +chr22 28251637 28251748 +chr22 28289749 28290033 +chr22 28310641 28310700 +chr22 28329337 28329406 +chr22 28381134 28381162 +chr22 28401643 28401778 +chr22 28409528 28409629 +chr22 28415006 28415201 +chr22 28418923 28419082 +chr22 28430036 28430060 +chr22 28455431 28455730 +chr22 28513640 28513818 +chr22 28518908 28518995 +chr22 28650024 28650444 +chr22 28701822 28701953 +chr22 28775616 28775913 +chr22 28792574 28792856 +chr22 28833119 28833382 +chr22 28833852 28834020 +chr22 28834758 28834829 +chr22 28912970 28913120 +chr22 28938422 28938564 +chr22 28981491 28982242 +chr22 29081893 29082086 +chr22 
29143460 29143755 +chr22 29163942 29164113 +chr22 29170703 29171432 +chr22 29174297 29174518 +chr22 29297142 29297369 +chr22 29398153 29398318 +chr22 29406314 29406548 +chr22 29501553 29501859 +chr22 29514716 29515004 +chr22 29535761 29535966 +chr22 29557419 29557729 +chr22 29566176 29566295 +chr22 29580406 29580553 +chr22 29581239 29581340 +chr22 29586474 29586767 +chr22 29617663 29617825 +chr22 29634162 29634260 +chr22 29665682 29666520 +chr22 29715967 29716040 +chr22 29718253 29718457 +chr22 29719525 29719610 +chr22 29724468 29724585 +chr22 29725438 29725856 +chr22 29830353 29830897 +chr22 29832587 29832904 +chr22 29875852 29876389 +chr22 29880939 29881125 +chr22 29921550 29921840 +chr22 29972875 29973055 +chr22 29985217 29985524 +chr22 30010195 30010275 +chr22 30061250 30062064 +chr22 30074019 30074319 +chr22 30139419 30139483 +chr22 30215968 30216128 +chr22 30218486 30218618 +chr22 30228070 30228602 +chr22 30249672 30249811 +chr22 30250107 30250329 +chr22 30261489 30261656 +chr22 30275498 30275569 +chr22 30326885 30326962 +chr22 30388467 30388727 +chr22 30426541 30426668 +chr22 30519418 30519724 +chr22 30603832 30604756 +chr22 30618339 30618650 +chr22 30621834 30621889 +chr22 30635628 30635664 +chr22 30650005 30650048 +chr22 30684209 30684324 +chr22 30692329 30693031 +chr22 30701318 30701642 +chr22 30814859 30814964 +chr22 30833889 30836255 +chr22 30840527 30840814 +chr22 30882409 30882545 +chr22 30890035 30890251 +chr22 30908477 30908554 +chr22 30938879 30939097 +chr22 30949286 30949371 +chr22 30969991 30970046 +chr22 30998146 30998676 +chr22 31015923 31016407 +chr22 31026247 31026300 +chr22 31139624 31139849 +chr22 31148061 31148545 +chr22 31174379 31174667 +chr22 31309396 31309706 +chr22 31309735 31310541 +chr22 31320793 31320991 +chr22 31421027 31421421 +chr22 31513556 31514163 +chr22 31536405 31536712 +chr22 31547159 31547189 +chr22 31551058 31551235 +chr22 31587837 31587898 +chr22 31660378 31660686 +chr22 31702215 31702520 +chr22 31721636 31721835 +chr22 
31729003 31729102 +chr22 31729642 31729888 +chr22 31774204 31774498 +chr22 31819464 31819769 +chr22 31869414 31869586 +chr22 31873507 31873601 +chr22 31894208 31894493 +chr22 31951219 31951254 +chr22 31969166 31969543 +chr22 31989462 31989790 +chr22 32127012 32127492 +chr22 32162804 32162991 +chr22 32164477 32164787 +chr22 32166290 32166768 +chr22 32186803 32186939 +chr22 32213208 32213454 +chr22 32218125 32218436 +chr22 32326693 32327276 +chr22 32358668 32358742 +chr22 32424789 32425081 +chr22 32437379 32437690 +chr22 32483376 32483736 +chr22 32565535 32565688 +chr22 32586190 32586800 +chr22 32628594 32628761 +chr22 32639944 32640383 +chr22 32757104 32758743 +chr22 32820909 32821039 +chr22 32832739 32833031 +chr22 32914273 32914836 +chr22 32925863 32926151 +chr22 32953816 32953842 +chr22 32981675 32981795 +chr22 32982556 32982754 +chr22 32997166 32997260 +chr22 33055955 33056011 +chr22 33056342 33056661 +chr22 33064295 33064545 +chr22 33131273 33131312 +chr22 33235487 33235824 +chr22 33241110 33241506 +chr22 33257778 33258719 +chr22 33265495 33265721 +chr22 33272361 33272394 +chr22 33275182 33275777 +chr22 33280099 33280349 +chr22 33292748 33292941 +chr22 33324896 33325119 +chr22 33423800 33424047 +chr22 33442056 33442231 +chr22 33503244 33503406 +chr22 33550336 33551263 +chr22 33582064 33582259 +chr22 33606658 33607090 +chr22 33623748 33624056 +chr22 33698325 33698604 +chr22 33713966 33714035 +chr22 33726866 33726930 +chr22 33816689 33816762 +chr22 33818283 33818417 +chr22 33831749 33831796 +chr22 33890146 33890391 +chr22 33948751 33949070 +chr22 34033459 34033810 +chr22 34040609 34040758 +chr22 34071428 34071598 +chr22 34089724 34089746 +chr22 34092699 34093727 +chr22 34106568 34106645 +chr22 34109431 34109774 +chr22 34113104 34113314 +chr22 34149018 34149061 +chr22 34171331 34171543 +chr22 34200598 34200856 +chr22 34224859 34224940 +chr22 34251235 34255889 +chr22 34283404 34283456 +chr22 34293881 34294060 +chr22 34356468 34356706 +chr22 34397943 34398715 +chr22 
34416990 34417052 +chr22 34417179 34418004 +chr22 34483080 34483156 +chr22 34490913 34490957 +chr22 34513897 34514305 +chr22 34700944 34701017 +chr22 34715060 34715206 +chr22 34737784 34738354 +chr22 34754186 34754271 +chr22 34762187 34762362 +chr22 34796484 34796554 +chr22 34799179 34799292 +chr22 34800886 34801054 +chr22 34849442 34851769 +chr22 34865917 34866124 +chr22 34872347 34872604 +chr22 34873289 34874068 +chr22 34962040 34962207 +chr22 34970312 34971067 +chr22 35019623 35019868 +chr22 35034142 35034284 +chr22 35035736 35035796 +chr22 35078231 35078260 +chr22 35079420 35079560 +chr22 35098779 35099164 +chr22 35100348 35100530 +chr22 35110163 35110247 +chr22 35128852 35128947 +chr22 35157294 35157407 +chr22 35228643 35228965 +chr22 35385318 35385425 +chr22 35390845 35390941 +chr22 35418686 35418724 +chr22 35463311 35463840 +chr22 35469270 35469367 +chr22 35586139 35586289 +chr22 35604857 35604880 +chr22 35643333 35643371 +chr22 35696052 35696400 +chr22 35706868 35707522 +chr22 35763279 35763575 +chr22 35801419 35801472 +chr22 35812489 35812511 +chr22 35820339 35820714 +chr22 35843823 35843909 +chr22 35852384 35852502 +chr22 35896276 35896315 +chr22 35917108 35917220 +chr22 35945449 35945736 +chr22 35976044 35976183 +chr22 35991902 35992144 +chr22 36008534 36008837 +chr22 36048286 36048595 +chr22 36110223 36110344 +chr22 36112554 36112708 +chr22 36113494 36113788 +chr22 36145905 36146038 +chr22 36150589 36150889 +chr22 36198632 36198797 +chr22 36253250 36253375 +chr22 36396512 36397064 +chr22 36397507 36397548 +chr22 36431929 36431962 +chr22 36456993 36457082 +chr22 36480393 36480532 +chr22 36492510 36492817 +chr22 36502202 36502266 +chr22 36518090 36518134 +chr22 36551201 36551496 +chr22 36608499 36608615 +chr22 36623767 36623835 +chr22 36643461 36643524 +chr22 36694275 36694348 +chr22 36705672 36706035 +chr22 36722368 36722427 +chr22 36762134 36762206 +chr22 36807171 36807334 +chr22 36811558 36811592 +chr22 36856787 36856858 +chr22 36863285 36863373 +chr22 
36927900 36927998 +chr22 36941958 36942148 +chr22 36943615 36943798 +chr22 36963451 36963727 +chr22 36978440 36978601 +chr22 37002607 37002668 +chr22 37006725 37007239 +chr22 37019520 37019560 +chr22 37034529 37034837 +chr22 37043508 37043722 +chr22 37048854 37048891 +chr22 37069526 37069693 +chr22 37092193 37092795 +chr22 37096165 37096380 +chr22 37098671 37098726 +chr22 37119762 37119972 +chr22 37146953 37147109 +chr22 37150439 37152596 +chr22 37183495 37183640 +chr22 37206328 37206456 +chr22 37224628 37224690 +chr22 37260022 37260446 +chr22 37263116 37263789 +chr22 37324339 37324560 +chr22 37358954 37359089 +chr22 37449157 37449241 +chr22 37470125 37470335 +chr22 37471548 37471855 +chr22 37514997 37515169 +chr22 37515606 37515816 +chr22 37520821 37520985 +chr22 37613294 37613824 +chr22 37644484 37644584 +chr22 37667094 37667530 +chr22 37668168 37668768 +chr22 37679155 37679444 +chr22 37685404 37685574 +chr22 37712322 37713069 +chr22 37768824 37768970 +chr22 37843141 37843452 +chr22 37848085 37848134 +chr22 37849787 37849947 +chr22 37870717 37871147 +chr22 37884625 37884862 +chr22 37958187 37958493 +chr22 37964558 37964872 +chr22 38003577 38003874 +chr22 38005711 38005935 +chr22 38005940 38006227 +chr22 38031186 38031302 +chr22 38038729 38038796 +chr22 38075742 38075916 +chr22 38087789 38087834 +chr22 38166777 38167269 +chr22 38176686 38177127 +chr22 38214830 38214866 +chr22 38217360 38217670 +chr22 38273599 38275811 +chr22 38290858 38290955 +chr22 38317282 38317308 +chr22 38317313 38317375 +chr22 38318745 38319240 +chr22 38333185 38333309 +chr22 38349678 38350251 +chr22 38370294 38370755 +chr22 38413410 38413512 +chr22 38421979 38422397 +chr22 38516613 38516816 +chr22 38571789 38572071 +chr22 38588641 38588957 +chr22 38679410 38679671 +chr22 38692497 38692809 +chr22 38693841 38693940 +chr22 38724670 38724810 +chr22 38743304 38743608 +chr22 38775944 38776112 +chr22 38780473 38780587 +chr22 38923659 38923880 +chr22 38930358 38932613 +chr22 38945987 38946719 +chr22 
38950488 38950527 +chr22 38979832 38980144 +chr22 39009807 39010435 +chr22 39127425 39127712 +chr22 39140503 39140670 +chr22 39152559 39152608 +chr22 39162210 39163191 +chr22 39238454 39238693 +chr22 39276360 39276521 +chr22 39302730 39302919 +chr22 39329629 39329715 +chr22 39382157 39382233 +chr22 39387031 39387209 +chr22 39503818 39504097 +chr22 39535649 39535696 +chr22 39538318 39538438 +chr22 39566335 39566754 +chr22 39601819 39602374 +chr22 39607321 39607494 +chr22 39682840 39683107 +chr22 39694510 39694787 +chr22 39765176 39765209 +chr22 39777392 39777413 +chr22 39818880 39819878 +chr22 39910032 39910389 +chr22 39920937 39920990 +chr22 40049234 40049323 +chr22 40154558 40154910 +chr22 40241000 40241057 +chr22 40250933 40250951 +chr22 40300926 40300948 +chr22 40342301 40342474 +chr22 40381672 40381962 +chr22 40390640 40390905 +chr22 40456252 40456339 +chr22 40466924 40467003 +chr22 40489697 40489988 +chr22 40548874 40549054 +chr22 40555640 40555938 +chr22 40564089 40564316 +chr22 40565264 40565566 +chr22 40583530 40583868 +chr22 40609014 40609308 +chr22 40675843 40676260 +chr22 40724620 40724816 +chr22 40746643 40750204 +chr22 40752400 40752713 +chr22 40763986 40764269 +chr22 40785447 40785718 +chr22 40804827 40804962 +chr22 40866690 40866724 +chr22 40907139 40907439 +chr22 40973568 40973841 +chr22 41020046 41020169 +chr22 41035441 41037057 +chr22 41043402 41043718 +chr22 41070393 41070590 +chr22 41078270 41078559 +chr22 41080666 41081164 +chr22 41112499 41112858 +chr22 41155155 41156001 +chr22 41162074 41162167 +chr22 41170653 41170962 +chr22 41194046 41194461 +chr22 41194469 41194770 +chr22 41228975 41229261 +chr22 41248977 41249073 +chr22 41263358 41263679 +chr22 41292156 41292449 +chr22 41322188 41322485 +chr22 41373767 41374984 +chr22 41381288 41381371 +chr22 41391248 41391615 +chr22 41393474 41393794 +chr22 41476126 41476739 +chr22 41501305 41501436 +chr22 41537165 41537362 +chr22 41580126 41580333 +chr22 41608948 41609235 +chr22 41609805 41610395 +chr22 
41627095 41627236 +chr22 41657490 41657816 +chr22 41665380 41665575 +chr22 41681733 41681888 +chr22 41748401 41748538 +chr22 41750259 41750679 +chr22 41755786 41755863 +chr22 41845929 41846042 +chr22 41847508 41847567 +chr22 41899999 41900110 +chr22 41921609 41921908 +chr22 41929935 41930236 +chr22 41941154 41941280 +chr22 41975946 41976360 +chr22 42002215 42002718 +chr22 42005231 42006764 +chr22 42027502 42027574 +chr22 42050660 42051669 +chr22 42120358 42120616 +chr22 42179381 42179533 +chr22 42182635 42183248 +chr22 42195500 42195798 +chr22 42235913 42236205 +chr22 42266108 42266177 +chr22 42296169 42296206 +chr22 42319656 42319704 +chr22 42377967 42378281 +chr22 42382378 42382448 +chr22 42389989 42390105 +chr22 42438732 42439040 +chr22 42485857 42486092 +chr22 42546327 42546490 +chr22 42567382 42567700 +chr22 42569255 42569559 +chr22 42569839 42569874 +chr22 42639345 42639795 +chr22 42645208 42645292 +chr22 42672345 42672869 +chr22 42773961 42774256 +chr22 42781759 42782167 +chr22 42865491 42865654 +chr22 42917791 42917938 +chr22 42927250 42927548 +chr22 42936628 42936917 +chr22 42947698 42947854 +chr22 43060014 43060102 +chr22 43080498 43080794 +chr22 43104863 43105143 +chr22 43112189 43112497 +chr22 43174718 43174906 +chr22 43178323 43178620 +chr22 43231508 43231617 +chr22 43252764 43252858 +chr22 43272796 43272905 +chr22 43308785 43308971 +chr22 43332942 43333109 +chr22 43334753 43334973 +chr22 43344497 43344548 +chr22 43371049 43371149 +chr22 43386207 43386553 +chr22 43392009 43392309 +chr22 43402765 43402891 +chr22 43448222 43448427 +chr22 43454310 43454572 +chr22 43470701 43470800 +chr22 43529412 43529895 +chr22 43641333 43641632 +chr22 43658237 43658514 +chr22 43699097 43699571 +chr22 43702024 43702269 +chr22 43760548 43760644 +chr22 43795819 43796079 +chr22 43797976 43798431 +chr22 43813300 43813583 +chr22 43838443 43838810 +chr22 43881946 43882078 +chr22 43896708 43896825 +chr22 43908194 43908501 +chr22 43949325 43949651 +chr22 43991613 43992002 +chr22 
44023228 44023606 +chr22 44066861 44067141 +chr22 44176095 44176625 +chr22 44178440 44178987 +chr22 44179208 44179744 +chr22 44248594 44248729 +chr22 44277244 44277448 +chr22 44303164 44303203 +chr22 44325828 44325975 +chr22 44384484 44384517 +chr22 44491717 44492163 +chr22 44533055 44533084 +chr22 44539477 44540306 +chr22 44571644 44571769 +chr22 44601639 44602423 +chr22 44611877 44611983 +chr22 44636593 44637393 +chr22 44686294 44687686 +chr22 44889303 44889418 +chr22 44952929 44952970 +chr22 44976506 44976622 +chr22 45054881 45055191 +chr22 45108668 45108956 +chr22 45119120 45119165 +chr22 45131211 45131512 +chr22 45152262 45152314 +chr22 45154888 45155129 +chr22 45177846 45178118 +chr22 45213242 45213303 +chr22 45248669 45248761 +chr22 45256746 45256883 +chr22 45294882 45295167 +chr22 45303786 45305778 +chr22 45339430 45339544 +chr22 45361330 45361638 +chr22 45395575 45395860 +chr22 45463640 45465121 +chr22 45480312 45480364 +chr22 45500780 45500917 +chr22 45649087 45649263 +chr22 45703024 45703178 +chr22 45733453 45733770 +chr22 45809446 45809586 +chr22 45821637 45821694 +chr22 45841391 45841467 +chr22 45929001 45929084 +chr22 45940599 45941224 +chr22 45943660 45943756 +chr22 46026200 46026294 +chr22 46215455 46215772 +chr22 46231040 46231520 +chr22 46278901 46279017 +chr22 46283387 46283474 +chr22 46427025 46427250 +chr22 46466255 46466411 +chr22 46466758 46467110 +chr22 46492981 46493261 +chr22 46557637 46557751 +chr22 46558680 46560161 +chr22 46598355 46598406 +chr22 46697290 46697499 +chr22 46898305 46898918 +chr22 46937132 46939802 +chr22 46939805 46940331 +chr22 46941934 46941967 +chr22 46952417 46952523 +chr22 47022222 47024461 +chr22 47079492 47080820 +chr22 47216896 47217189 +chr22 47223222 47223405 +chr22 47224226 47224925 +chr22 47229631 47229853 +chr22 47252417 47252539 +chr22 47269405 47269951 +chr22 47286845 47286920 +chr22 47291620 47291663 +chr22 47317523 47318451 +chr22 47321220 47322381 +chr22 47344533 47344589 +chr22 47402830 47403129 +chr22 
47412759 47414595 +chr22 47452535 47452592 +chr22 47525652 47525813 +chr22 47527764 47528163 +chr22 47607182 47607219 +chr22 47675646 47675735 +chr22 47730617 47731226 +chr22 47911324 47911772 +chr22 48025579 48025806 +chr22 48038159 48038282 +chr22 48041169 48041969 +chr22 48047912 48047952 +chr22 48053259 48053598 +chr22 48090884 48091363 +chr22 48120964 48121391 +chr22 48139447 48139638 +chr22 48287924 48288237 +chr22 48328541 48328673 +chr22 48347640 48347780 +chr22 48468978 48469282 +chr22 48538074 48538235 +chr22 48549675 48549711 +chr22 48599737 48599843 +chr22 48654227 48654246 +chr22 48686596 48687483 +chr22 48743750 48743850 +chr22 48928044 48928188 +chr22 48932485 48936541 +chr22 48947287 48947606 +chr22 49028815 49028862 +chr22 49041709 49042030 +chr22 49082496 49082798 +chr22 49266667 49266881 +chr22 49359996 49360197 +chr22 49394192 49394228 +chr22 49440579 49441166 +chr22 49452131 49452334 +chr22 49480362 49480405 +chr22 49505723 49505924 +chr22 49511339 49511715 +chr22 49559138 49559246 +chr22 49569827 49569877 +chr22 49572088 49572126 +chr22 49657329 49657700 +chr22 49759131 49759277 +chr22 49857597 49858201 +chr22 49881398 49881666 +chr22 49915669 49915774 +chr22 49981006 49982892 +chr22 49987648 49988083 +chr22 50074671 50075098 +chr22 50092348 50092651 +chr22 50101513 50101811 +chr22 50214147 50214325 +chr22 50273212 50274222 +chr22 50357359 50357474 +chr22 50386127 50386487 +chr22 50395487 50396695 +chr22 50403486 50403659 +chr22 50501361 50501661 +chr22 50624075 50624449 +chr22 50641027 50641222 +chr22 50675373 50675908 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.SegDup.chr22.bed b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.SegDup.chr22.bed new file mode 100644 index 00000000000..34dae449f4e --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.SegDup.chr22.bed @@ -0,0 +1,229 @@ +chr22 10510000 10716866 
+chr22 10717194 10783858 +chr22 10834643 10874572 +chr22 10924572 10966724 +chr22 11016724 11068956 +chr22 11118987 11160921 +chr22 11236613 11375636 +chr22 11428056 11497337 +chr22 11550626 11631288 +chr22 11681288 11724629 +chr22 11774629 11977555 +chr22 12028037 12126656 +chr22 12126660 12225588 +chr22 12275588 12375376 +chr22 12489121 12641730 +chr22 12691730 12693537 +chr22 12694103 12726204 +chr22 12776204 12818137 +chr22 12868137 12904788 +chr22 15200000 15934152 +chr22 16027648 16029124 +chr22 16049831 16268393 +chr22 16302843 16304296 +chr22 16305427 16307046 +chr22 16378028 16568459 +chr22 16568499 16582793 +chr22 16583474 16608423 +chr22 16608436 16610320 +chr22 16610337 16680489 +chr22 16680506 16766359 +chr22 16838852 16843928 +chr22 16850819 16912063 +chr22 16922834 16928158 +chr22 17014088 17038567 +chr22 17214090 17215248 +chr22 17419415 17420675 +chr22 17763709 17765328 +chr22 17984579 17998239 +chr22 18008719 18010388 +chr22 18159723 18239129 +chr22 18339129 18433513 +chr22 18483513 18518714 +chr22 18518836 18659561 +chr22 18709564 18939750 +chr22 18950240 18976730 +chr22 18977700 18981974 +chr22 18999690 19010480 +chr22 19010632 19015209 +chr22 19023255 19035499 +chr22 20141036 20166817 +chr22 20262035 20290267 +chr22 20299686 20301294 +chr22 20301634 20323554 +chr22 20324592 20362643 +chr22 20362648 20377695 +chr22 20388431 20390146 +chr22 20451509 20453240 +chr22 20482745 20484148 +chr22 20667580 20671339 +chr22 20672650 20674606 +chr22 20678347 20699044 +chr22 20700323 20720038 +chr22 20731437 20738049 +chr22 21010365 21018352 +chr22 21042050 21050489 +chr22 21057379 21062735 +chr22 21111384 21151083 +chr22 21151091 21443089 +chr22 21458187 21562827 +chr22 21954496 21958494 +chr22 22114615 22118379 +chr22 22118384 22120610 +chr22 22220568 22224566 +chr22 22249758 22315122 +chr22 22320797 22327981 +chr22 22402552 22410447 +chr22 22620608 22655111 +chr22 22664938 22668785 +chr22 22670912 22699474 +chr22 22699496 22703424 +chr22 22704161 22717672 
+chr22 22732715 22761861 +chr22 22761911 22776923 +chr22 22789775 22794414 +chr22 22820054 22823467 +chr22 22897363 22901536 +chr22 22902711 22906902 +chr22 23263953 23264988 +chr22 23306925 23321261 +chr22 23322143 23351708 +chr22 23351727 23357279 +chr22 23358158 23378265 +chr22 23378551 23380944 +chr22 23381237 23402335 +chr22 23402754 23405250 +chr22 23415788 23427393 +chr22 23432025 23461151 +chr22 23464357 23480727 +chr22 23480999 23489014 +chr22 23493585 23499842 +chr22 23499856 23510388 +chr22 23511082 23552138 +chr22 23553526 23577255 +chr22 23594547 23602870 +chr22 23616462 23620198 +chr22 23620202 23631127 +chr22 23631491 23644932 +chr22 23658595 23661000 +chr22 23670225 23679116 +chr22 23734856 23737374 +chr22 23853822 23856326 +chr22 23929519 23932056 +chr22 23939932 23969548 +chr22 23971773 24001392 +chr22 24179310 24184938 +chr22 24236854 24268803 +chr22 24268948 24283958 +chr22 24286747 24301118 +chr22 24310536 24314778 +chr22 24377585 24389259 +chr22 24598467 24684046 +chr22 24686848 24690592 +chr22 24690598 24693136 +chr22 24701278 24703178 +chr22 24764774 24766052 +chr22 25177661 25179556 +chr22 25226889 25247092 +chr22 25253643 25258280 +chr22 25258590 25284823 +chr22 25284953 25313517 +chr22 25314206 25328057 +chr22 25330858 25340072 +chr22 25344989 25348911 +chr22 25350903 25382596 +chr22 25385378 25414514 +chr22 25415392 25426013 +chr22 25426105 25427123 +chr22 25428803 25435365 +chr22 25454748 25471797 +chr22 25502468 25507013 +chr22 25515602 25532252 +chr22 25559573 25569251 +chr22 25575611 25576630 +chr22 25580799 25617150 +chr22 25649024 25652032 +chr22 26560427 26561624 +chr22 28675662 28696392 +chr22 29355386 29357923 +chr22 29360314 29368998 +chr22 29399343 29409561 +chr22 29410818 29411866 +chr22 29411971 29424893 +chr22 29434688 29443954 +chr22 29488845 29491290 +chr22 29608860 29609951 +chr22 29880407 29902476 +chr22 30543830 30544914 +chr22 30655294 30657152 +chr22 30902152 30904579 +chr22 31057439 31062134 +chr22 31065390 31078993 
+chr22 31204482 31207013 +chr22 32111378 32113026 +chr22 32130965 32141884 +chr22 32148444 32149492 +chr22 32150491 32158285 +chr22 32159300 32162703 +chr22 32170391 32172935 +chr22 32175877 32184749 +chr22 32187175 32197389 +chr22 32271774 32273163 +chr22 32273176 32279646 +chr22 32279657 32329510 +chr22 32331229 32341624 +chr22 32341631 32344607 +chr22 32344611 32346315 +chr22 32357195 32363983 +chr22 36127120 36129313 +chr22 36534514 36537489 +chr22 38355621 38401450 +chr22 38957757 38964074 +chr22 38982097 38988028 +chr22 38988116 38993669 +chr22 39020021 39066093 +chr22 39069234 39080941 +chr22 39520561 39522683 +chr22 39875503 39876516 +chr22 40824594 40827232 +chr22 41073572 41075248 +chr22 41433493 41434823 +chr22 41708403 41709587 +chr22 41901063 41902969 +chr22 42000444 42002033 +chr22 42045815 42047270 +chr22 42123191 42132193 +chr22 42135343 42145873 +chr22 42149884 42155241 +chr22 42446192 42450718 +chr22 42479610 42489336 +chr22 42500173 42520238 +chr22 42523089 42528444 +chr22 42529035 42536680 +chr22 42553733 42559454 +chr22 42566420 42582450 +chr22 42775905 42777359 +chr22 43032309 43040224 +chr22 43166623 43168160 +chr22 43644840 43645853 +chr22 44112740 44114031 +chr22 44114040 44115404 +chr22 44222726 44266377 +chr22 44266598 44277214 +chr22 44566935 44568316 +chr22 44858020 44860015 +chr22 44865612 44867631 +chr22 45184450 45188011 +chr22 46139130 46141577 +chr22 46479626 46481736 +chr22 46615032 46616157 +chr22 46616221 46617437 +chr22 48618772 48620136 +chr22 48620180 48621405 +chr22 48901510 48903279 +chr22 48910475 48911620 +chr22 48911634 48912886 +chr22 49383944 49385910 +chr22 49386637 49388496 +chr22 50432257 50442552 +chr22 50740515 50808468 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config.tsv new file mode 100644 index 00000000000..6ca199c9600 --- /dev/null +++ 
b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config.tsv @@ -0,0 +1,7 @@ +NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS +INS_small_SD INS -1 -1 SD +DEL_50_5k_both DEL 50 5000 SD,RM +DEL_5k_50k_SD DEL 5000 50000 SD +DUP_lt5kb_RM DUP 5000 RM +INV_gt1kb INV 1000 -1 NULL +BND_SD BND -1 -1 SD diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_duplicate.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_duplicate.tsv new file mode 100644 index 00000000000..417167668b7 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_duplicate.tsv @@ -0,0 +1,8 @@ +NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS +INS_small_SD INS -1 -1 SD +DEL_50_5k_dup DEL 50 5000 SD +DEL_50_5k_dup DEL 50 5000 RM +DEL_5k_50k_SD DEL 5000 50000 SD +DUP_lt5kb_RM DUP 5000 RM +INV_gt1kb INV 1000 -1 NULL +BND_SD BND -1 -1 SD diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_redundant.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_redundant.tsv new file mode 100644 index 00000000000..990df0d9458 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_redundant.tsv @@ -0,0 +1,8 @@ +NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS +INS_small_SD INS -1 -1 SD +DEL_50_5k_SD DEL 50 5000 SD +DEL_50_5k_RM DEL 50 5000 RM +DEL_5k_50k_SD DEL 5000 50000 SD +DUP_lt5kb_RM DUP 5000 RM +INV_gt1kb INV 1000 -1 NULL +BND_SD BND -1 -1 SD