diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/Pipelines.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/Pipelines.java index 7497df6..855c22b 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/Pipelines.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/Pipelines.java @@ -686,7 +686,7 @@ public void reflexivDSDynamicAssemblyStepsPipe() throws IOException{ if (checkOutputFile(param.outputPath + "/Assembly_intermediate/01Iteration" + param.startIteration + "_" + param.endIteration)){ info.readMessage("Removing: " + param.inputKmerPath.substring(0,param.inputKmerPath.length()-6)); info.screenDump(); - cleanDiskStorage(param.inputKmerPath.substring(0,param.inputKmerPath.length()-6)); + // cleanDiskStorage(param.inputKmerPath.substring(0,param.inputKmerPath.length()-6)); }else{ info.readMessage("Failed " + param.startIteration + " -> " + param.endIteration + " iterations : "); info.screenDump(); @@ -1084,7 +1084,7 @@ public void reflexivDSDynamicReductionPipe() throws IOException { info.readMessage("Removing : Count_" + param.kmerSize1); info.screenDump(); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize1); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize1); } else { info.readMessage("Failed k-mer sorting : " + param.kmerSize1 + " failed:"); info.screenDump(); @@ -1098,7 +1098,7 @@ public void reflexivDSDynamicReductionPipe() throws IOException { info.readMessage("Removing : Count_" + param.kmerSize1); info.screenDump(); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize1); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize1); } if (!checkOutputFile(param.outputPath + "/Count_" + param.kmerSize2 + "_sorted")) { @@ -1156,7 +1156,7 @@ public void reflexivDSDynamicReductionPipe() throws IOException { info.readMessage("Removing : Count_" + param.kmerSize2); info.screenDump(); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize2); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize2); } else { info.readMessage("Failed k-mer sorting : " + param.kmerSize2 + " failed:"); info.screenDump(); @@ -1170,7 +1170,7 @@ public void reflexivDSDynamicReductionPipe() throws IOException { info.readMessage("Removing : Count_" + param.kmerSize2); info.screenDump(); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize2); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize2); } param.inputKmerPath1 = param.outputPath + "/Count_" + param.kmerSize1 + "_sorted/part*.csv.gz"; @@ -1187,7 +1187,7 @@ public void reflexivDSDynamicReductionPipe() throws IOException { info.readMessage("Removing: Count_" + param.kmerSize1 + "_sorted"); info.screenDump(); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize1 + "_sorted"); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize1 + "_sorted"); } else { info.readMessage("Failed k-mer reduction : " + param.kmerSize2 + " vs " + param.kmerSize1 + " failed:"); info.screenDump(); @@ -1202,7 +1202,7 @@ public void reflexivDSDynamicReductionPipe() throws IOException { // if (param.kmerSize2 <100) { info.readMessage("Removing: Count_" + param.kmerSize2 + "_sorted"); info.screenDump(); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize2 + "_sorted"); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize2 + "_sorted"); /* }else { info.readMessage("Rename last k-mer sorted to k-mer reduced"); info.screenDump(); @@ -1216,9 +1216,9 @@ public void reflexivDSDynamicReductionPipe() throws IOException { info.readMessage("Removing: Count_" + param.kmerSize1 + "_sorted, Count_" + param.kmerSize1 + ", and Count_" + param.kmerSize2); info.screenDump(); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize1 + "_sorted"); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize1); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize2); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize1 + "_sorted"); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize1); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerSize2); } } @@ -1279,7 +1279,7 @@ public void reflexivDSDynamicReductionPipe() throws IOException { info.readMessage("Removing : Count_" + param.kmerListInt[param.kmerListInt.length - 1]); info.screenDump(); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerListInt[param.kmerListInt.length - 1]); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerListInt[param.kmerListInt.length - 1]); } else { info.readMessage("Failed k-mer sorting : " + param.kmerListInt[param.kmerListInt.length - 1] + " failed:"); info.screenDump(); @@ -1292,7 +1292,7 @@ public void reflexivDSDynamicReductionPipe() throws IOException { info.readMessage("Removing : Count_" + param.kmerListInt[param.kmerListInt.length-1] ); info.screenDump(); - cleanDiskStorage(param.outputPath + "/Count_" + param.kmerListInt[param.kmerListInt.length-1] ); +// cleanDiskStorage(param.outputPath + "/Count_" + param.kmerListInt[param.kmerListInt.length-1] ); info.readMessage("This is the last k-mer reduction round"); info.screenDump(); diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerFirstFour.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerFirstFour.java index 33206b2..fb80121 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerFirstFour.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerFirstFour.java @@ -200,9 +200,12 @@ public void assemblyFromKmer() { iterations++; ReflexivSubKmerDS = ReflexivSubKmerDS.sort("k-1"); ReflexivSubKmerDS = ReflexivSubKmerDS.mapPartitions(DSKmerExtention, ReflexivSubKmerEncoderCompressed); + + // long kmerNumber = ReflexivSubKmerDS.count(); + // System.out.println("kmer count after extension " + iterations + " is " + kmerNumber); } - // ReflexivSubKmerDS.persist(StorageLevel.MEMORY_AND_DISK()); + // ReflexivSubKmerDS.persist(StorageLevel.MEMORY_AND_DISK()); DSBinarySubKmerWithShortExtensionToString SubKmerToString = new DSBinarySubKmerWithShortExtensionToString(); ReflexivLongSubKmerStringDS = ReflexivSubKmerDS.mapPartitions(SubKmerToString, ReflexivLongKmerStringEncoder); @@ -211,7 +214,7 @@ public void assemblyFromKmer() { ReflexivLongSubKmerStringDS.write(). mode(SaveMode.Overwrite). format("csv"). - option("compression", "lz4"). + option("compression", "gzip"). save(param.outputPath + "/Assembly_intermediate/00firstFour"); @@ -1750,6 +1753,9 @@ public Iterator call(Iterator sIterator) throws Exception { lineMarker++; // return reflexivKmerConcatList.iterator(); } + + // System.out.println("tempReflexivkmer size: " + tmpReflexivKmerExtendList.size()); + // System.out.println("reflexivKmerConcatList size: " + reflexivKmerConcatList.size()); } // while loop tmpKmerRandomizer(); diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicMercyKmer.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicMercyKmer.java index 51228e1..c97695b 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicMercyKmer.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicMercyKmer.java @@ -538,7 +538,7 @@ public Iterator call(Iterator s) { nucleotideBinarySlot[param.kmerListHash.get(currentKmerSize)-1] |= kmerEndMark; // param.kmerListHash.get(currentKmerSize)] == currentKmerBlockSize // long[] RCnucleotideBinarySlot = binaryBlockReverseComplementary(nucleotideBinarySlot); - System.out.println("kmer Binary: " + BinaryBlocksToString(nucleotideBinarySlot)); + // System.out.println("kmer Binary: " + BinaryBlocksToString(nucleotideBinarySlot)); // System.out.println("kmer Binary RC: " + BinaryBlocksToString(RCnucleotideBinarySlot)); // return kmerList.add( @@ -715,7 +715,7 @@ public Iterator call(Iterator s) { kmerEndMark <<= 2*(32-1-((currentExtensionSize-1)%31+1)); extensionBinarySlot[currentExtensionBlockSize-1] |= kmerEndMark; // param.kmerListHash.get(currentKmerSize)] == currentKmerBlockSize - System.out.println("kmer Binarized: " + BinaryBlocksToString(extensionBinarySlot)); + // System.out.println("kmer Binarized: " + BinaryBlocksToString(extensionBinarySlot)); // attribute= onlyChangeReflexivMarker(attribute,1); kmerList.add( @@ -951,9 +951,9 @@ public Iterator call(Iterator sIterator) throws Exception{ // the leftover of lastMarkerArray will not find a read anymore - for (int i=0; i 11---------- -> 0011111111111 leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker - System.out.println("getLeftMarker before: " + leftMarker); + // System.out.println("getLeftMarker before: " + leftMarker); if (leftMarker>30000){ leftMarker=30000-leftMarker; @@ -1243,10 +1243,10 @@ public Iterator call(Iterator sIterator) throws Exception{ indices.add((int)s.getLong(1)); }else{ if (s.getLong(0) == lastKmer.getLong(0)){ - System.out.println("sorted read ID: " + s.getLong(0) + " index " + s.getLong(1)); + // System.out.println("sorted read ID: " + s.getLong(0) + " index " + s.getLong(1)); indices.add((int)s.getLong(1)); }else{ - System.out.println("sorted current different from last, current: " + s.getLong(0) + " index " + s.getLong(1) + " last: " + lastKmer.getLong(0) + " index "+ lastKmer.getLong(1)); + // System.out.println("sorted current different from last, current: " + s.getLong(0) + " index " + s.getLong(1) + " last: " + lastKmer.getLong(0) + " index "+ lastKmer.getLong(1)); indices.add((int)lastKmer.getLong(1)); // add the last one if (indices.size()>1) { // more than one match @@ -1276,10 +1276,10 @@ public Iterator call(Iterator sIterator) throws Exception{ } } - for (int i =0; i< ReadsAndRange.size(); i++){ - System.out.println("Read ID: " + ReadsAndRange.get(i).getLong(0) + " first range: " + getLeftMarker( ((long[]) ReadsAndRange.get(i).get(1))[0] ) + " to " + getRightMarker( ((long[]) ReadsAndRange.get(i).get(1))[0]) ); + // for (int i =0; i< ReadsAndRange.size(); i++){ + // System.out.println("Read ID: " + ReadsAndRange.get(i).getLong(0) + " first range: " + getLeftMarker( ((long[]) ReadsAndRange.get(i).get(1))[0] ) + " to " + getRightMarker( ((long[]) ReadsAndRange.get(i).get(1))[0]) ); - } + // } indices.add((int)lastKmer.getLong(1)) ; // add the last one if (indices.size()>1){ @@ -1298,9 +1298,9 @@ public Iterator call(Iterator sIterator) throws Exception{ } } - if (ReadsAndRange.size()>0) { - System.out.println("Read ID last: " + ReadsAndRange.get(ReadsAndRange.size() - 1).getLong(0) + " first range: " + getLeftMarker(((long[]) ReadsAndRange.get(ReadsAndRange.size() - 1).get(1))[0]) + " to " + getRightMarker(((long[]) ReadsAndRange.get(ReadsAndRange.size() - 1).get(1))[0])); - } + // if (ReadsAndRange.size()>0) { + // System.out.println("Read ID last: " + ReadsAndRange.get(ReadsAndRange.size() - 1).getLong(0) + " first range: " + getLeftMarker(((long[]) ReadsAndRange.get(ReadsAndRange.size() - 1).get(1))[0]) + " to " + getRightMarker(((long[]) ReadsAndRange.get(ReadsAndRange.size() - 1).get(1))[0])); + // } return ReadsAndRange.iterator(); } @@ -1319,12 +1319,12 @@ private long[] findRange(List i, long index){ int a = 0; int b = 0; for (int j =1 ; j 1){ // at least one k-mer gap a= lastIndex; b = i.get(j); - System.out.println("a gap: " + index + " from " + a + " to " + i.get(j)); + // System.out.println("a gap: " + index + " from " + a + " to " + i.get(j)); /** * a-1 means that the gap starts from the next k-mer @@ -1410,7 +1410,7 @@ public Iterator call(Iterator sIterator) throws Exception{ while (sIterator.hasNext()){ Row s = sIterator.next(); - System.out.println("Findmatch new entry: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); + // System.out.println("Findmatch new entry: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); // ------- k-mer // ------- read // ------- read @@ -1419,13 +1419,13 @@ public Iterator call(Iterator sIterator) throws Exception{ if (ReadsKmerBuffer.size()>0){ for (int i =0; i< ReadsKmerBuffer.size();i++){ if (dynamicSubKmerComparator(ReadsKmerBuffer.get(i).getSeq(0), s.getSeq(0)) == true){ - System.out.println("Findmatch buffered difference compared: " + BinaryBlocksToString( seq2array(ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getSeq(0)) ) + " ID " + ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getLong(1) + " index/mark " + ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getLong(2)); - System.out.println("Findmatch buffered to current compared: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); + // System.out.println("Findmatch buffered difference compared: " + BinaryBlocksToString( seq2array(ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getSeq(0)) ) + " ID " + ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getLong(1) + " index/mark " + ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getLong(2)); + // System.out.println("Findmatch buffered to current compared: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); ReadsAndIndices.add( RowFactory.create(ReadsKmerBuffer.get(i).getLong(1), ReadsKmerBuffer.get(i).getLong(2)) ); }else{ - System.out.println("Findmatch buffered did not match current: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); + // System.out.println("Findmatch buffered did not match current: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); } } @@ -1433,12 +1433,12 @@ public Iterator call(Iterator sIterator) throws Exception{ } lastKmer = s; - System.out.println("Findmatch lastKmer: " + BinaryBlocksToString( seq2array(lastKmer.getSeq(0)) ) + " ID " + lastKmer.getLong(1) + " index/mark " + lastKmer.getLong(2)); + // System.out.println("Findmatch lastKmer: " + BinaryBlocksToString( seq2array(lastKmer.getSeq(0)) ) + " ID " + lastKmer.getLong(1) + " index/mark " + lastKmer.getLong(2)); }else{ // a read k-mer if (lastKmer !=null){ if (dynamicSubKmerComparator(lastKmer.getSeq(0), s.getSeq(0)) == true){ - System.out.println("Findmatch lastKmer compared: " + BinaryBlocksToString( seq2array(lastKmer.getSeq(0)) ) + " ID " + lastKmer.getLong(1) + " index/mark " + lastKmer.getLong(2)); - System.out.println("Findmatch current to last directly compared: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); + // System.out.println("Findmatch lastKmer compared: " + BinaryBlocksToString( seq2array(lastKmer.getSeq(0)) ) + " ID " + lastKmer.getLong(1) + " index/mark " + lastKmer.getLong(2)); + // System.out.println("Findmatch current to last directly compared: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); ReadsAndIndices.add( RowFactory.create(s.getLong(1), s.getLong(2)) ); @@ -1449,30 +1449,30 @@ public Iterator call(Iterator sIterator) throws Exception{ // ----- if (ReadsKmerBuffer.size()>0) { if (dynamicSubKmerComparator(ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getSeq(0), s.getSeq(0)) == true) { - System.out.println("Findmatch buffered difference compared: " + BinaryBlocksToString( seq2array(ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getSeq(0)) ) + " ID " + ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getLong(1) + " index/mark " + ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getLong(2)); - System.out.println("Findmatch current to buffered compared: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); + // System.out.println("Findmatch buffered difference compared: " + BinaryBlocksToString( seq2array(ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getSeq(0)) ) + " ID " + ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getLong(1) + " index/mark " + ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getLong(2)); + // System.out.println("Findmatch current to buffered compared: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); ReadsKmerBuffer.add(s); } else { - System.out.println("Findmatch read k-mer did not match lastKmer :" + BinaryBlocksToString(seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); + // System.out.println("Findmatch read k-mer did not match lastKmer :" + BinaryBlocksToString(seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); ReadsKmerBuffer = new ArrayList(); ReadsKmerBuffer.add(s); } }else{ - System.out.println("Findmatch read k-mer buffer empty :" + BinaryBlocksToString(seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); + // System.out.println("Findmatch read k-mer buffer empty :" + BinaryBlocksToString(seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); ReadsKmerBuffer.add(s); } } }else{ - System.out.println("Findmatch read without k-mer yet: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); + // System.out.println("Findmatch read without k-mer yet: " + BinaryBlocksToString( seq2array(s.getSeq(0)) ) + " ID " + s.getLong(1) + " index/mark " + s.getLong(2)); ReadsKmerBuffer.add(s); } } } - for (int i=0; i< ReadsAndIndices.size(); i++){ - System.out.println("Matched 2x k-mer read ID: " + ReadsAndIndices.get(i).getLong(0) + " and its index: " + ReadsAndIndices.get(i).getLong(1) ); - } + // for (int i=0; i< ReadsAndIndices.size(); i++){ + // System.out.println("Matched 2x k-mer read ID: " + ReadsAndIndices.get(i).getLong(0) + " and its index: " + ReadsAndIndices.get(i).getLong(1) ); + // } return ReadsAndIndices.iterator(); } @@ -1700,8 +1700,8 @@ public Iterator call(Iterator> s) { long[] nucleotideBinarySlot = new long[param.subKmerBinarySlots]; long[] nucleotideBinaryReverseComplementSlot = new long[param.subKmerBinarySlots]; - System.out.println("subKmerBinarySlots: " + param.subKmerBinarySlots); - System.out.println("subKmerSizeResidue: " + kmerResidue); + // System.out.println("subKmerBinarySlots: " + param.subKmerBinarySlots); + // System.out.println("subKmerSizeResidue: " + kmerResidue); for (int i = param.frontClip; i < readLength - param.endClip; i++) { nucleotide = read.charAt(i); @@ -1737,7 +1737,8 @@ public Iterator call(Iterator> s) { // the rest for (int j = param.subKmerBinarySlots - 2; j >= 0; j--) { - transitBit2 = nucleotideBinarySlot[j] >>> (2 * 31); // **--------------- -> 0000000000000** + transitBit2 = nucleotideBinarySlot[j] >>> (2 * 30); // --**--------------- -> 00000000000--** the two bits in the head will be shift out in the code below + transitBit2 &= 3L; // 00000000000--** -> 0000000000000** nucleotideBinarySlot[j] <<= 2; // --------------- -> --------------00 nucleotideBinarySlot[j] |= transitBit1; // -------------00 -> -------------** transitBit1 = transitBit2; @@ -1776,7 +1777,8 @@ public Iterator call(Iterator> s) { } } else { // the first transition bit from the first block - long transitBit1 = nucleotideBinaryReverseComplementSlot[0] << 2 * 31; + long transitBit1 = nucleotideBinaryReverseComplementSlot[0] << 2 * 31; // -----------** -> **0000000000000 + transitBit1 >>>=2; // **0000000000000 -> 00**00000000000 long transitBit2; nucleotideBinaryReverseComplementSlot[0] >>>= 2; @@ -1789,6 +1791,7 @@ public Iterator call(Iterator> s) { for (int j = 1; j < param.subKmerBinarySlots - 1; j++) { transitBit2 = nucleotideBinaryReverseComplementSlot[j] << 2 * 31; + transitBit2 >>>=2; // **0000000000000 -> 00**00000000000 nucleotideBinaryReverseComplementSlot[j] >>>= 2; // transitBit1 <<= 2*31; nucleotideBinaryReverseComplementSlot[j] |= transitBit1; @@ -1797,7 +1800,7 @@ public Iterator call(Iterator> s) { if (param.subKmerBinarySlots>1) { // if param.subKmerBinarySlots =1 , then the above loop will not happen, nucleotideBinaryReverseComplementSlot[0] will leff shift twice nucleotideBinaryReverseComplementSlot[param.subKmerBinarySlots - 1] >>>= 2; - transitBit1 >>>= 2 * (31 - kmerResidue + 1); + transitBit1 >>>= 2 * (31 - kmerResidue ); // 00**00000000000 -> 0000000**----- where length(-----) == kmerResidue -1 nucleotideBinaryReverseComplementSlot[param.subKmerBinarySlots - 1] |= transitBit1; } } @@ -1809,7 +1812,7 @@ public Iterator call(Iterator> s) { nucleotideBinarySlotPreRow[j] <<= 2;// --*************** -> ****************-- } - nucleotideBinarySlotPreRow[nucleotideBinarySlotPreRow.length - 1] >>>= 2; // above in the loop, the last slot left shitted 2 bits + nucleotideBinarySlotPreRow[nucleotideBinarySlotPreRow.length - 1] >>>= 2; // above in the loop, the last slot left shifted 2 bits nucleotideBinarySlotPreRow[nucleotideBinarySlotPreRow.length - 1] <<= 2 * (32 - kmerResidue); nucleotideBinarySlotPreRow[nucleotideBinarySlotPreRow.length - 1] |= (1L << 2 * (32 - kmerResidue - 1)); // add C marker // kmerList.add(RowFactory.create(nucleotideBinarySlotPreRow, ID, (long) i)); // the number does not matter, as the count is based on units @@ -1834,17 +1837,17 @@ public Iterator call(Iterator> s) { // ID=Long.MAX_VALUE; // } - System.out.println("forward k-mer extracted from Read: " + kmerReadExtracted + " ID " + ID + " index: " + ((long) i - param.kmerSize + 1)); + // System.out.println("forward k-mer extracted from Read: " + kmerReadExtracted + " ID " + ID + " index: " + ((long) i - param.kmerSize + 1)); long b = -ID; // long c = -b; // System.out.println("reverse k-mer extracted from Read: " + kmerRcReadExtracted + " ID " + b + " index: " + (long) (readLength - i - 1) + " test negative: " + c); - System.out.println("reverse k-mer extracted from Read: " + kmerRcReadExtracted + " ID " + ID + " index: " + ((long) i - param.kmerSize + 1)); + // System.out.println("reverse k-mer extracted from Read: " + kmerRcReadExtracted + " ID " + ID + " index: " + ((long) i - param.kmerSize + 1)); if (compareLongArrayBlocks(nucleotideBinarySlotPreRow, nucleotideBinaryReverseComplementSlotPreRow) == true) { - System.out.println("Choose : " + BinaryBlocksToString(nucleotideBinarySlotPreRow)); + // System.out.println("Choose : " + BinaryBlocksToString(nucleotideBinarySlotPreRow)); kmerList.add(RowFactory.create(nucleotideBinarySlotPreRow, ID, (long) (i - param.kmerSize + 1))); // the number does not matter, as the count is based on units }else{ - System.out.println("Choose : " + BinaryBlocksToString(nucleotideBinaryReverseComplementSlotPreRow)); + // System.out.println("Choose : " + BinaryBlocksToString(nucleotideBinaryReverseComplementSlotPreRow)); kmerList.add(RowFactory.create(nucleotideBinaryReverseComplementSlotPreRow, ID, (long) (i - param.kmerSize + 1))); // update 2, index is set to the same forward index // update, set reverse complement to the same read, as only low bit k-mer is used now. Later both forward and reverse complement mercy k-mer are used diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSorting.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSorting.java index 7b4b86a..98cf7cb 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSorting.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSorting.java @@ -176,10 +176,11 @@ public void assemblyFromKmer() { KmerBinaryCountDS = KmerCountDS.mapPartitions(DSBinarizer, KmerBinaryCountEncoder); KmerBinaryCountDS = KmerBinaryCountDS.filter(col("count") - .geq(param.minKmerCoverage) - .and(col("count") + // temporarily for mercy k-mers + // .geq(param.minKmerCoverage) + // .and(col("count") .leq(param.maxKmerCoverage) - ) + // ) ); @@ -451,7 +452,8 @@ public Iterator call(Iterator s) { int highestLeftMarker = getLeftMarker(HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getLong(1)); if (subKmerSlotComparator(subKmer.getSeq(0), HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getSeq(0)) == true) { if (leftMarker > highestLeftMarker) { - if (highestLeftMarker <= param.minErrorCoverage && leftMarker >= param.minRepeatFold * highestLeftMarker) { // should use rightMarker here . However, since in the beginning, left and right are the same as coverage, it does not matter + // highestLeftMarker > 1 is for mercy k-mer with only 1 coverage + if (highestLeftMarker <= param.minErrorCoverage && leftMarker >= param.minRepeatFold * highestLeftMarker && highestLeftMarker > 1) { // should use rightMarker here . However, since in the beginning, left and right are the same as coverage, it does not matter attribute = buildingAlongFromThreeInt(reflexivMarker, leftMarker, -1); HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) @@ -480,7 +482,7 @@ public Iterator call(Iterator s) { ); } } else { - if (leftMarker <= param.minErrorCoverage && highestLeftMarker >= param.minRepeatFold * leftMarker) { + if (leftMarker <= param.minErrorCoverage && highestLeftMarker >= param.minRepeatFold * leftMarker && leftMarker > 1) { subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); reflexivMarker=getReflexivMarker(subKmer.getLong(1)); leftMarker=getLeftMarker(subKmer.getLong(1)); @@ -702,7 +704,7 @@ public Iterator call(Iterator s) { int highestLeftMarker = getLeftMarker(HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getLong(1)); if (subKmerSlotComparator(subKmer.getSeq(0), HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getSeq(0)) == true) { if (leftMarker > HighCoverLastCoverage) { - if (HighCoverLastCoverage <= param.minErrorCoverage && leftMarker >= param.minRepeatFold * HighCoverLastCoverage) { + if (HighCoverLastCoverage <= param.minErrorCoverage && leftMarker >= param.minRepeatFold * HighCoverLastCoverage && HighCoverLastCoverage >1) { HighCoverLastCoverage = leftMarker; attribute = buildingAlongFromThreeInt(reflexivMarker, -1, rightMarker); HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, @@ -741,7 +743,7 @@ public Iterator call(Iterator s) { ); } } else { - if (leftMarker <= param.minErrorCoverage && HighCoverLastCoverage >= param.minRepeatFold * leftMarker) { + if (leftMarker <= param.minErrorCoverage && HighCoverLastCoverage >= param.minRepeatFold * leftMarker && leftMarker > 1) { subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); reflexivMarker=getReflexivMarker(subKmer.getLong(1));