Skip to content

Commit

Permalink
SamLocusAndReferenceIterator fails with leading indels at the start o…
Browse files Browse the repository at this point in the history
…f the reference (#1481)

* test case to show SamLocusAndReferenceIterator fails with leading indels
at the start of the reference
* candidate fix
  • Loading branch information
nh13 authored May 21, 2020
1 parent bd1b739 commit 709c9f8
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
*/
public class SamLocusAndReferenceIterator extends IterableOnceIterator<SamLocusAndReferenceIterator.SAMLocusAndReference> {

/** The base to use to indicate the locus is prior to the reference start (i.e. position zero). */
final static byte BASE_BEFORE_REFERENCE_START = (byte) '-';

private final ReferenceSequenceFileWalker referenceSequenceFileWalker;
private final SamLocusIterator locusIterator;

Expand Down Expand Up @@ -78,8 +81,15 @@ public SAMLocusAndReference next() {
final ReferenceSequence referenceSequence = referenceSequenceFileWalker.get(locus.getSequenceIndex(), locus.getSequenceName(),
locus.getSequenceLength());

//position is 1-based...arrays are 0-based!
return new SAMLocusAndReference(locus, referenceSequence.getBases()[locus.getPosition() - 1]);
// Developer notes:
// 1. position is 1-based...arrays are 0-based!
// 2. We must guard against insertions before the reference
if (locus.getPosition() == 0) {
return new SAMLocusAndReference(locus, BASE_BEFORE_REFERENCE_START);
}
else {
return new SAMLocusAndReference(locus, referenceSequence.getBases()[locus.getPosition() - 1]);
}
}

/** Small class to hold together
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,54 @@ public void testSamLocusAndReferenceIteratorMismatch() {
final SamLocusIterator samLocusIterator = new SamLocusIterator(samReader);
new SamLocusAndReferenceIterator(referenceSequenceFileWalker, samLocusIterator); // should throw
}

@Test
public void testSamLocusAndReferenceIteratorLeadingInsertion() {
final File reference = new File(TEST_DATA_DIR, "Homo_sapiens_assembly18.trimmed.fasta");
final File samFile = new File(TEST_DATA_DIR, "leading_insertion.sam");
final ReferenceSequenceFile referenceSequenceFile = new FastaSequenceFile(reference, false);
final ReferenceSequenceFileWalker referenceSequenceFileWalker = new ReferenceSequenceFileWalker(referenceSequenceFile);

final SamReader samReader = SamReaderFactory.makeDefault().open(samFile);
final SamLocusIterator samLocusIterator = new SamLocusIterator(samReader);
samLocusIterator.setIncludeIndels(true);
final SamLocusAndReferenceIterator samLocusAndReferences = new SamLocusAndReferenceIterator(referenceSequenceFileWalker, samLocusIterator);

IntervalList intervalList = new IntervalList(samReader.getFileHeader());
intervalList.add(new Interval("chrM", 1, 26));
intervalList.add(new Interval("chrM", 16546, 16571));

OverlapDetector<Interval> overlapDetector = new OverlapDetector<>(0, 0);
overlapDetector.addAll(intervalList.getIntervals(), intervalList.getIntervals());

// Developer note: these are the # of loci that we have (1) a read base mapped to a reference base, (2) a read
// base inserted relative the reference, and (3) a read base deleted relative to the reference. For read bases
// inserted prior to or after the reference (ex.. at position 1 of a contig), they are counted only once!
int totalMappedRecords = 0;
int totalInsertedRecords = 0;
int totalDeletedRecords = 0;
byte positionAtZeroBase = (byte) '?';
for (final SamLocusAndReferenceIterator.SAMLocusAndReference samLocusAndReference : samLocusAndReferences) {
final SamLocusIterator.LocusInfo locus = samLocusAndReference.getLocus();
if (locus.getContig() == "chrM" && locus.getPosition() == 0) {
positionAtZeroBase = samLocusAndReference.getReferenceBase();
}

totalMappedRecords += samLocusAndReference.getRecordAndOffsets().size();
totalInsertedRecords += samLocusAndReference.getLocus().getInsertedInRecord().size();
totalDeletedRecords += samLocusAndReference.getLocus().getDeletedInRecord().size();

if (overlapDetector.overlapsAny(samLocusAndReference.getLocus())) {
Assert.assertEquals(samLocusAndReference.getRecordAndOffsets().size(), 2);
}
else {
Assert.assertEquals(samLocusAndReference.getRecordAndOffsets().size(), 0);
}
}
Assert.assertEquals(positionAtZeroBase, SamLocusAndReferenceIterator.BASE_BEFORE_REFERENCE_START);
Assert.assertEquals(totalMappedRecords, (36 - 10) * 4);
// Developer note: not 10 * 4, since we count all inserted bases prior to or after the reference only once
Assert.assertEquals(totalInsertedRecords, 4);
Assert.assertEquals(totalDeletedRecords, 0);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
@HD VN:1.0 SO:coordinate
@SQ SN:chrM LN:16571 UR:file:/Users/mhanna/src/Sting/Homo_sapiens_assembly18.trimmed.fasta M5:d2ed829b8a1628d16cbeee88e88e39eb
@SQ SN:chr20 LN:1000000 UR:file:/Users/mhanna/src/Sting/Homo_sapiens_assembly18.trimmed.fasta M5:b4eac854d70893986ac578c53c2324f1
@RG ID:62A40.2 PL:illumina PU:62A40AAXX101028.2 LB:Solexa-45345 DT:2010-10-28T00:00:00-0400 SM:RRBS885 CN:BI
1 99 chrM 1 60 10I26M chrM 1 0 ACATCACGATGATCACAGGTCTATCACCCTATTAAC AAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBB RG:Z:62A40.2
1 147 chrM 1 60 10I26M chrM 1 0 ACATCACGATGATCACAGGTCTATCACCCTATTAAC BBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:62A40.2
2 99 chrM 16546 60 26M10I chrM 1 0 ACATCACGATGATCACAGGTCTATCACCCTATTAAC AAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBB RG:Z:62A40.2
2 147 chrM 16546 60 26M10I chrM 1 0 ACATCACGATGATCACAGGTCTATCACCCTATTAAC BBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:62A40.2

0 comments on commit 709c9f8

Please sign in to comment.