From 709c9f8403e454af0d6a9ad460c409beba3ad441 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Thu, 21 May 2020 15:48:09 -0700 Subject: [PATCH] SamLocusAndReferenceIterator fails with leading indels at the start of the reference (#1481) * test case to show SamLocusAndReferenceIterator fails with leading indels at the start of the reference * candidate fix --- .../SamLocusAndReferenceIterator.java | 14 +++++- .../SamLocusAndReferenceIteratorTest.java | 50 +++++++++++++++++++ .../samtools/reference/leading_insertion.sam | 8 +++ 3 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 src/test/resources/htsjdk/samtools/reference/leading_insertion.sam diff --git a/src/main/java/htsjdk/samtools/reference/SamLocusAndReferenceIterator.java b/src/main/java/htsjdk/samtools/reference/SamLocusAndReferenceIterator.java index ec8d0e01b5..bae8641571 100644 --- a/src/main/java/htsjdk/samtools/reference/SamLocusAndReferenceIterator.java +++ b/src/main/java/htsjdk/samtools/reference/SamLocusAndReferenceIterator.java @@ -41,6 +41,9 @@ */ public class SamLocusAndReferenceIterator extends IterableOnceIterator { + /** The base to use to indicate the locus is prior to the reference start (i.e. position zero). */ + final static byte BASE_BEFORE_REFERENCE_START = (byte) '-'; + private final ReferenceSequenceFileWalker referenceSequenceFileWalker; private final SamLocusIterator locusIterator; @@ -78,8 +81,15 @@ public SAMLocusAndReference next() { final ReferenceSequence referenceSequence = referenceSequenceFileWalker.get(locus.getSequenceIndex(), locus.getSequenceName(), locus.getSequenceLength()); - //position is 1-based...arrays are 0-based! - return new SAMLocusAndReference(locus, referenceSequence.getBases()[locus.getPosition() - 1]); + // Developer notes: + // 1. position is 1-based...arrays are 0-based! + // 2. We must guard against insertions before the reference + if (locus.getPosition() == 0) { + return new SAMLocusAndReference(locus, BASE_BEFORE_REFERENCE_START); + } + else { + return new SAMLocusAndReference(locus, referenceSequence.getBases()[locus.getPosition() - 1]); + } } /** Small class to hold together diff --git a/src/test/java/htsjdk/samtools/reference/SamLocusAndReferenceIteratorTest.java b/src/test/java/htsjdk/samtools/reference/SamLocusAndReferenceIteratorTest.java index c2f98036cd..cf4362c6ad 100644 --- a/src/test/java/htsjdk/samtools/reference/SamLocusAndReferenceIteratorTest.java +++ b/src/test/java/htsjdk/samtools/reference/SamLocusAndReferenceIteratorTest.java @@ -53,4 +53,54 @@ public void testSamLocusAndReferenceIteratorMismatch() { final SamLocusIterator samLocusIterator = new SamLocusIterator(samReader); new SamLocusAndReferenceIterator(referenceSequenceFileWalker, samLocusIterator); // should throw } + + @Test + public void testSamLocusAndReferenceIteratorLeadingInsertion() { + final File reference = new File(TEST_DATA_DIR, "Homo_sapiens_assembly18.trimmed.fasta"); + final File samFile = new File(TEST_DATA_DIR, "leading_insertion.sam"); + final ReferenceSequenceFile referenceSequenceFile = new FastaSequenceFile(reference, false); + final ReferenceSequenceFileWalker referenceSequenceFileWalker = new ReferenceSequenceFileWalker(referenceSequenceFile); + + final SamReader samReader = SamReaderFactory.makeDefault().open(samFile); + final SamLocusIterator samLocusIterator = new SamLocusIterator(samReader); + samLocusIterator.setIncludeIndels(true); + final SamLocusAndReferenceIterator samLocusAndReferences = new SamLocusAndReferenceIterator(referenceSequenceFileWalker, samLocusIterator); + + IntervalList intervalList = new IntervalList(samReader.getFileHeader()); + intervalList.add(new Interval("chrM", 1, 26)); + intervalList.add(new Interval("chrM", 16546, 16571)); + + OverlapDetector overlapDetector = new OverlapDetector<>(0, 0); + overlapDetector.addAll(intervalList.getIntervals(), intervalList.getIntervals()); + + // Developer note: these are the # of loci that we have (1) a read base mapped to a reference base, (2) a read + // base inserted relative the reference, and (3) a read base deleted relative to the reference. For read bases + // inserted prior to or after the reference (ex.. at position 1 of a contig), they are counted only once! + int totalMappedRecords = 0; + int totalInsertedRecords = 0; + int totalDeletedRecords = 0; + byte positionAtZeroBase = (byte) '?'; + for (final SamLocusAndReferenceIterator.SAMLocusAndReference samLocusAndReference : samLocusAndReferences) { + final SamLocusIterator.LocusInfo locus = samLocusAndReference.getLocus(); + if (locus.getContig() == "chrM" && locus.getPosition() == 0) { + positionAtZeroBase = samLocusAndReference.getReferenceBase(); + } + + totalMappedRecords += samLocusAndReference.getRecordAndOffsets().size(); + totalInsertedRecords += samLocusAndReference.getLocus().getInsertedInRecord().size(); + totalDeletedRecords += samLocusAndReference.getLocus().getDeletedInRecord().size(); + + if (overlapDetector.overlapsAny(samLocusAndReference.getLocus())) { + Assert.assertEquals(samLocusAndReference.getRecordAndOffsets().size(), 2); + } + else { + Assert.assertEquals(samLocusAndReference.getRecordAndOffsets().size(), 0); + } + } + Assert.assertEquals(positionAtZeroBase, SamLocusAndReferenceIterator.BASE_BEFORE_REFERENCE_START); + Assert.assertEquals(totalMappedRecords, (36 - 10) * 4); + // Developer note: not 10 * 4, since we count all inserted bases prior to or after the reference only once + Assert.assertEquals(totalInsertedRecords, 4); + Assert.assertEquals(totalDeletedRecords, 0); + } } diff --git a/src/test/resources/htsjdk/samtools/reference/leading_insertion.sam b/src/test/resources/htsjdk/samtools/reference/leading_insertion.sam new file mode 100644 index 0000000000..17e3c6de0d --- /dev/null +++ b/src/test/resources/htsjdk/samtools/reference/leading_insertion.sam @@ -0,0 +1,8 @@ +@HD VN:1.0 SO:coordinate +@SQ SN:chrM LN:16571 UR:file:/Users/mhanna/src/Sting/Homo_sapiens_assembly18.trimmed.fasta M5:d2ed829b8a1628d16cbeee88e88e39eb +@SQ SN:chr20 LN:1000000 UR:file:/Users/mhanna/src/Sting/Homo_sapiens_assembly18.trimmed.fasta M5:b4eac854d70893986ac578c53c2324f1 +@RG ID:62A40.2 PL:illumina PU:62A40AAXX101028.2 LB:Solexa-45345 DT:2010-10-28T00:00:00-0400 SM:RRBS885 CN:BI +1 99 chrM 1 60 10I26M chrM 1 0 ACATCACGATGATCACAGGTCTATCACCCTATTAAC AAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBB RG:Z:62A40.2 +1 147 chrM 1 60 10I26M chrM 1 0 ACATCACGATGATCACAGGTCTATCACCCTATTAAC BBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:62A40.2 +2 99 chrM 16546 60 26M10I chrM 1 0 ACATCACGATGATCACAGGTCTATCACCCTATTAAC AAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBB RG:Z:62A40.2 +2 147 chrM 16546 60 26M10I chrM 1 0 ACATCACGATGATCACAGGTCTATCACCCTATTAAC BBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:62A40.2 \ No newline at end of file