From a214966e829ed6c598af7acf312ec3fb914c3800 Mon Sep 17 00:00:00 2001 From: rhinempi Date: Wed, 13 Sep 2023 22:48:51 +0200 Subject: [PATCH] Merge overlapped paired end reads --- bin/reflexiv | 28 +- .../bielefeld/cmg/reflexiv/main/MainMeta.java | 2 +- .../cmg/reflexiv/main/MainOfMercy.java | 2 +- .../cmg/reflexiv/main/MainOfMerger.java | 2 +- .../reflexiv/main/MainOfPreProcessing.java | 2 +- .../cmg/reflexiv/main/MainOfReduce.java | 2 +- .../cmg/reflexiv/main/MainOfStitch.java | 2 +- .../reflexiv/pipeline/ReflexivCounter.java | 2 - .../pipeline/ReflexivDSDynamicKmer64.java | 7 +- .../ReflexivDSDynamicKmerFixingRoundTwo.java | 6 - .../ReflexivDSDynamicKmerIteration.java | 3 - .../ReflexivDSDynamicKmerPatching.java | 35 - .../ReflexivDSDynamicKmerRuduction.java | 5197 +++++------------ .../pipeline/ReflexivDSDynamicMercyKmer.java | 4218 +++++++++++++ .../ReflexivDSKmerLeftAndRightSorting.java | 35 +- .../ReflexivDSKmerLeftAndRightSortingOld.java | 160 +- .../pipeline/ReflexivDSKmerProcessing64.java | 487 -- .../cmg/reflexiv/pipeline/ReflexivDSMain.java | 16 +- .../reflexiv/pipeline/ReflexivDSMain64.java | 170 +- .../pipeline/ReflexivDSMainMercy.java | 76 - .../pipeline/ReflexivDSMainMeta64.java | 17 +- .../reflexiv/pipeline/ReflexivDSMerger.java | 980 +--- .../pipeline/ReflexivDSReAssembler64.java | 61 +- .../pipeline/ReflexivDSStitchingLonger.java | 5 +- .../pipeline/ReflexivDataFrameCounter.java | 47 +- .../pipeline/ReflexivDataFrameCounter64.java | 217 +- .../ReflexivDataFrameDecompresser.java | 296 +- .../ReflexivDataFrameReAssembleCounter64.java | 105 - 28 files changed, 5923 insertions(+), 6257 deletions(-) mode change 100755 => 100644 bin/reflexiv create mode 100644 src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicMercyKmer.java diff --git a/bin/reflexiv b/bin/reflexiv old mode 100755 new mode 100644 index 699f46f..5e058a5 --- a/bin/reflexiv +++ b/bin/reflexiv @@ -23,8 +23,8 @@ #----------------------------------------------------------------------------- name="Reflexiv" -version="0.8" -spark_version="2.0.0" # only for auto downloading Spark package +version="1.0" +spark_version="3.2.1" # only for auto downloading Spark package readlink -f 1>/dev/null 2>/dev/null readlinkReturn=$? @@ -48,7 +48,7 @@ SH_SBIN=${SH_SBIN:-$SH_HOME/sbin} SH_OPT="" SH_URL="https://github.com/rhinempi/reflexiv/archive/latest.zip" -SPARK_URL="http://d3kbcqa49mib13.cloudfront.net/spark-$spark_version-bin-hadoop2.6.tgz" # 1.6.0 version +SPARK_URL="https://archive.apache.org/dist/spark/spark-$spark_version/spark-$spark_version-bin-hadoop3.2.tgz" # 1.6.0 version SPARK_CMD="" # in case no spark home was found, please manually paste the "spark-submit" file path SPARK_OPT="" @@ -80,6 +80,8 @@ function die() { function get_spark() { if command -v wget &>/dev/null; then + echo $1 + echo $2 GET="wget -q $1 -O $2" elif command -v curl &>/dev/null; then GET="curl $1 -o $2" @@ -158,12 +160,12 @@ fi SPARK_CMD="$(which spark-submit)" elif [ -x "$SPARK_CMD" ]; then continue - elif [ -x "$SH_PACKAGE/spark-$spark_version-bin-hadoop2.6/bin/spark-submit" ]; then - SPARK_CMD="$SH_PACKAGE/spark-$spark_version-bin-hadoop2.6/bin/spark-submit" + elif [ -x "$SH_PACKAGE/spark-$spark_version-bin-hadoop3.2/bin/spark-submit" ]; then + SPARK_CMD="$SH_PACKAGE/spark-$spark_version-bin-hadoop3.2/bin/spark-submit" else - get_spark "$SPARK_URL" "$SH_PACKAGE/spark-$spark_version-bin-hadoop2.6.tgz" - untar_spark "$SH_PACKAGE/spark-$spark_version-bin-hadoop2.6.tgz" "$SH_PACKAGE" - SPARK_CMD= "$SH_PACKAGE/spark-$spark_version-bin-hadoop2.6/bin/spark-submit" + get_spark "$SPARK_URL" "$SH_PACKAGE/spark-$spark_version-bin-hadoop3.2.tgz" + untar_spark "$SH_PACKAGE/spark-$spark_version-bin-hadoop3.2.tgz" "$SH_PACKAGE" + SPARK_CMD= "$SH_PACKAGE/spark-$spark_version-bin-hadoop3.2/bin/spark-submit" fi # Verify reflexiv jar is available @@ -182,6 +184,8 @@ function dump_help() { echo " run Run the entire assembly pipeline" echo " counter counting Kmer frequency" echo " reassembler re-assemble and extend genome fragments" + echo " meta assemble metagenomes" + echo " reduce Dynamic reduction of k-mers" echo "" echo "Type each command to view its options, eg. Usage: ./reflexiv run" echo "" @@ -218,7 +222,7 @@ function parse_param() { if [[ ${args[$i+1]} == -* ]]; then SPARK_OPT+="${args[$i]} " else - SPARK_OPT+="${args[$i]} ${args[$i+1]} " + SPARK_OPT+="${args[$i]} \"${args[$i+1]}\" " fi elif [[ ${args[$i]} == -* ]]; then if [[ ${args[$i+1]} == -* ]]; then @@ -259,17 +263,19 @@ elif [[ ${MODULE} == "merger" ]]; then mainClass="uni.bielefeld.cmg.reflexiv.main.MainOfMerger" elif [[ ${MODULE} == "mercy" ]]; then mainClass="uni.bielefeld.cmg.reflexiv.main.MainOfMercy" +elif [[ ${MODULE} == "preprocess" ]]; then + mainClass="uni.bielefeld.cmg.reflexiv.main.MainOfPreProcessing" else dump_help exit 1; fi # Assemble the command line -cmdline="$SPARK_CMD $SPARK_OPT --class $mainClass $SH_JAR $SH_OPT" +cmdline="$SPARK_CMD $SPARK_OPT --jars $SH_LIB/hadoop-4mc-3.0.0.jar --files $SH_SBIN/flash --class $mainClass $SH_JAR $SH_OPT" # launch command function launch_reflexiv() { exec bash -c "exec $cmdline" } -launch_reflexiv \ No newline at end of file +launch_reflexiv diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainMeta.java b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainMeta.java index e4cb402..cf686e9 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainMeta.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainMeta.java @@ -56,7 +56,7 @@ public class MainMeta { */ public static void main(String[] args) throws IOException { InfoDumper info = new InfoDumper(); - info.readParagraphedMessages("Reflexiv main initiating ... \ninterpreting parameters."); + info.readParagraphedMessages("Reflexiv assembly initiating ... \ninterpreting parameters."); info.screenDump(); Parameter parameter = null; diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfMercy.java b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfMercy.java index d089a24..0ae5e0f 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfMercy.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfMercy.java @@ -56,7 +56,7 @@ public class MainOfMercy { */ public static void main(String[] args){ InfoDumper info = new InfoDumper(); - info.readParagraphedMessages("Reflexiv main initiating ... \ninterpreting parameters."); + info.readParagraphedMessages("Reflexiv mercy-kmer initiating ... \ninterpreting parameters."); info.screenDump(); Parameter parameter = null; diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfMerger.java b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfMerger.java index 25c89ec..3307d3f 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfMerger.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfMerger.java @@ -56,7 +56,7 @@ public class MainOfMerger { */ public static void main(String[] args){ InfoDumper info = new InfoDumper(); - info.readParagraphedMessages("Reflexiv main initiating ... \ninterpreting parameters."); + info.readParagraphedMessages("Reflexiv contig merger initiating ... \ninterpreting parameters."); info.screenDump(); Parameter parameter = null; diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfPreProcessing.java b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfPreProcessing.java index fa860ef..0aea351 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfPreProcessing.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfPreProcessing.java @@ -57,7 +57,7 @@ public class MainOfPreProcessing { */ public static void main(String[] args) throws IOException { InfoDumper info = new InfoDumper(); - info.readParagraphedMessages("Reflexiv main initiating ... \ninterpreting parameters."); + info.readParagraphedMessages("Reflexiv reads preprocessing initiating ... \ninterpreting parameters."); info.screenDump(); Parameter parameter = null; diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfReduce.java b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfReduce.java index c016f92..39d0b21 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfReduce.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfReduce.java @@ -56,7 +56,7 @@ public class MainOfReduce { */ public static void main(String[] args) throws IOException { InfoDumper info = new InfoDumper(); - info.readParagraphedMessages("Reflexiv main initiating ... \ninterpreting parameters."); + info.readParagraphedMessages("Reflexiv k-mer reduction initiating ... \ninterpreting parameters."); info.screenDump(); Parameter parameter = null; diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfStitch.java b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfStitch.java index a41f730..aedb384 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfStitch.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/main/MainOfStitch.java @@ -56,7 +56,7 @@ public class MainOfStitch { */ public static void main(String[] args) throws IOException { InfoDumper info = new InfoDumper(); - info.readParagraphedMessages("Reflexiv main initiating ... \ninterpreting parameters."); + info.readParagraphedMessages("Reflexiv contig stitching initiating ... \ninterpreting parameters."); info.screenDump(); Parameter parameter = null; diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivCounter.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivCounter.java index f6cff2b..622aa88 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivCounter.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivCounter.java @@ -125,8 +125,6 @@ public void assembly(){ JavaPairRDD> ReflexivLongSubKmerRDD; JavaPairRDD> ReflexivSubKmerStringRDD; // Generates strings, for testing - // JavaPairRDD> ForwardSubKmerRDD; - // JavaPairRDD> ReflectedSubKmerRDD; JavaPairRDD ContigTuple2RDD; JavaPairRDD, Long> ContigTuple2IndexRDD; diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmer64.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmer64.java index cb7bedc..d8769e1 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmer64.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmer64.java @@ -153,12 +153,7 @@ public void assembly() { kmerCountTupleStruct = kmerCountTupleStruct.add("kmerBlocks", DataTypes.createArrayType(DataTypes.LongType), false); kmerCountTupleStruct = kmerCountTupleStruct.add("count", DataTypes.IntegerType, false); ExpressionEncoder KmerBinaryCountEncoder = RowEncoder.apply(kmerCountTupleStruct); -/* - StructType kmerBinaryStruct = new StructType(); - kmerBinaryStruct = kmerBinaryStruct.add("kmerBlocks", DataTypes.createArrayType(DataTypes.LongType), false); - kmerBinaryStruct = kmerBinaryStruct.add("count", DataTypes.IntegerType, false); - ExpressionEncoder kmerBinaryEncoder = RowEncoder.apply(kmerBinaryStruct); -*/ + Dataset ReflexivSubKmerDS; StructType ReflexivKmerStruct = new StructType(); ReflexivKmerStruct = ReflexivKmerStruct.add("k-1", DataTypes.createArrayType(DataTypes.LongType), false); diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerFixingRoundTwo.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerFixingRoundTwo.java index 70ceb3f..6f2e757 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerFixingRoundTwo.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerFixingRoundTwo.java @@ -157,12 +157,6 @@ public void assemblyFromKmer() { ReflexivFixingKmerStruct= ReflexivFixingKmerStruct.add("extension", DataTypes.createArrayType(DataTypes.LongType), false); ExpressionEncoder ReflexivFixingKmerEndocer = RowEncoder.apply(ReflexivFixingKmerStruct); - StructType ReflexivLongKmerStructCompressed = new StructType(); - ReflexivLongKmerStructCompressed= ReflexivLongKmerStructCompressed.add("k-1", DataTypes.createArrayType(DataTypes.LongType), false); - ReflexivLongKmerStructCompressed= ReflexivLongKmerStructCompressed.add("attribute", DataTypes.LongType, false); - ReflexivLongKmerStructCompressed= ReflexivLongKmerStructCompressed.add("extension", DataTypes.createArrayType(DataTypes.LongType), false); - ExpressionEncoder ReflexivLongSubKmerEncoderCompressed = RowEncoder.apply(ReflexivLongKmerStructCompressed); - StructType ContigLongKmerStringStruct = new StructType(); ContigLongKmerStringStruct = ContigLongKmerStringStruct.add("ID", DataTypes.StringType, false); ContigLongKmerStringStruct = ContigLongKmerStringStruct.add("contig", DataTypes.StringType, false); diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerIteration.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerIteration.java index c87f29c..badb9f5 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerIteration.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerIteration.java @@ -178,9 +178,6 @@ public void assemblyFromKmer() { DynamicKmerBinarizerFromReducedToSubKmer ReducedKmerToSubKmer= new DynamicKmerBinarizerFromReducedToSubKmer(); ReflexivLongSubKmerDS = KmerCountDS.mapPartitions(ReducedKmerToSubKmer, ReflexivLongSubKmerEncoderCompressed); - // DSkmerRandomReflection DSrandomizeSubKmer = new DSkmerRandomReflection(); - // ReflexivSubKmerDS = ReflexivSubKmerDS.mapPartitions(DSrandomizeSubKmer, ReflexivSubKmerEncoderCompressed); - DSExtendReflexivKmerToArrayLoop DSKmerExtenstionArrayToArray = new DSExtendReflexivKmerToArrayLoop(); diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerPatching.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerPatching.java index 56c7209..caa0f99 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerPatching.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerPatching.java @@ -155,14 +155,6 @@ private Hashtable, Integer> SubKmerProbRowToHash(List s){ * */ public void assemblyFromKmer() throws IOException { - /* SparkConf conf = setSparkConfiguration(); - info.readMessage("Initiating Spark context ..."); - info.screenDump(); - info.readMessage("Start Spark framework"); - info.screenDump(); - JavaSparkContext sc = new JavaSparkContext(conf); -*/ - SparkSession spark = setSparkSessionConfiguration(param.shufflePartition); info.readMessage("Initiating Spark SQL context ..."); @@ -233,12 +225,6 @@ public void assemblyFromKmer() throws IOException { FastqDSTuple = spark.createDataset(FastqIndex.rdd(), Encoders.tuple(Encoders.STRING(), Encoders.LONG())); FastqDSTuple.persist(StorageLevel.DISK_ONLY()); -/* - FastqDSTuple.write(). - mode(SaveMode.Overwrite). - format("csv"). - option("compression", "gzip").save(param.outputPath + "/Assembly_intermediate/ZippedFastqForDebug"); -*/ ReverseComplementKmerBinaryExtractionFromDataset DSExtractRCKmerBinaryFromFastq = new ReverseComplementKmerBinaryExtractionFromDataset(); @@ -271,9 +257,6 @@ public void assemblyFromKmer() throws IOException { ContigSeedDS = ContigSeedDS.union(ReadSeedDS); ContigSeedDS = ContigSeedDS.sort("seed"); - // long contigSeedPartition = ContigSeedDS.javaRDD().getNumPartitions(); - // long contigSeedSize = ContigSeedDS.javaRDD().count(); - // System.out.println("ContigSeed partition: " + contigSeedPartition + " and count: " + contigSeedSize); Dataset RACpairDS; StructType RACPairStruct = new StructType(); @@ -287,10 +270,6 @@ public void assemblyFromKmer() throws IOException { RACpairDS= RACpairDS.sort("read", "contig"); - // long RACpairPartition = RACpairDS.javaRDD().getNumPartitions(); - // long RACpairSize = RACpairDS.javaRDD().count(); - // System.out.println("RACpair partitions: " + RACpairPartition + " and count: " + RACpairSize); - Dataset CCPairDS; StructType CCPairStruct = new StructType(); CCPairStruct = CCPairStruct.add("left", DataTypes.LongType, false); @@ -301,9 +280,6 @@ public void assemblyFromKmer() throws IOException { CreatCCPairs matchContigToContig = new CreatCCPairs(); CCPairDS = RACpairDS.mapPartitions(matchContigToContig, CCPairEncoder); - // long ccpairPartition1 = CCPairDS.javaRDD().getNumPartitions(); - // long ccpairSize1 = CCPairDS.javaRDD().count(); - // System.out.println("ccpair partitions 1: " + ccpairPartition1 + " and count: " + ccpairSize1); CCPairDS = CCPairDS.sort("left", "right"); @@ -319,9 +295,6 @@ public void assemblyFromKmer() throws IOException { CCPairDS= CCPairDS.mapPartitions(filterForCCpair,CCPairEncoderCount); CCPairDS=CCPairDS.sort(col("right").asc(), col("count").desc()); - // long ccpairPartition = CCPairDS.javaRDD().getNumPartitions(); - // long ccpairSize = CCPairDS.javaRDD().count(); - // System.out.println("ccpair partitions 2: " + ccpairPartition + " and count: " + ccpairSize); Dataset MarkedReads; StructType CCNetStruct = new StructType(); @@ -340,10 +313,6 @@ public void assemblyFromKmer() throws IOException { MarkedReads = MarkedReads.sort("read"); - // long readPartitions= MarkedReads.javaRDD().getNumPartitions(); - // long readSize = MarkedReads.javaRDD().count(); - // System.out.println("read partitions:" + readPartitions + " and count: " + readSize); - Dataset CCNetWithSeq; StructType ContigSeqStruct = new StructType(); ContigSeqStruct = ContigSeqStruct.add("ID", DataTypes.LongType, false); @@ -356,10 +325,6 @@ public void assemblyFromKmer() throws IOException { CCNetWithSeq = CCNetWithSeq.union(markerTupleRow); CCNetWithSeq= CCNetWithSeq.sort("ID"); - // long ccnetParitions = CCNetWithSeq.javaRDD().getNumPartitions(); - // long ccnetSize = CCNetWithSeq.javaRDD().getNumPartitions(); - // System.out.println("ccnet partitions:" + ccnetParitions + " and count: " + ccnetSize); - Dataset reflexivKmer; StructType ReflexivLongKmerStructCompressed = new StructType(); ReflexivLongKmerStructCompressed= ReflexivLongKmerStructCompressed.add("k-1", DataTypes.createArrayType(DataTypes.LongType), false); diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerRuduction.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerRuduction.java index ea87583..9f92ebc 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerRuduction.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicKmerRuduction.java @@ -63,6 +63,7 @@ */ public class ReflexivDSDynamicKmerRuduction implements Serializable { private long time; + private DefaultParam param; private InfoDumper info = new InfoDumper(); @@ -155,25 +156,6 @@ public void assemblyFromKmer() { Dataset KmerCountDS; Dataset LongerKmerCountDS; - Dataset KmerBinaryCountDS; - DatasetLongerKmerBinaryCountDS; - - StructType kmerCountTupleStruct = new StructType(); - kmerCountTupleStruct = kmerCountTupleStruct.add("kmer", DataTypes.createArrayType(DataTypes.LongType), false); - kmerCountTupleStruct = kmerCountTupleStruct.add("count", DataTypes.IntegerType, false); - ExpressionEncoder KmerBinaryCountEncoder = RowEncoder.apply(kmerCountTupleStruct); - - Dataset ReflexivSubKmerDS; - Dataset LongerReflexivSubKmerDS; - StructType ReflexivKmerStruct = new StructType(); - ReflexivKmerStruct = ReflexivKmerStruct.add("k-1", DataTypes.createArrayType(DataTypes.LongType), false); - ReflexivKmerStruct = ReflexivKmerStruct.add("reflection", DataTypes.IntegerType, false); - ReflexivKmerStruct = ReflexivKmerStruct.add("extension", DataTypes.LongType, false); - ReflexivKmerStruct = ReflexivKmerStruct.add("left", DataTypes.IntegerType, false); - ReflexivKmerStruct = ReflexivKmerStruct.add("right", DataTypes.IntegerType, false); - ExpressionEncoder ReflexivSubKmerEncoder = RowEncoder.apply(ReflexivKmerStruct); - - Dataset ReflexivSubKmerDSCompressed; StructType ReflexivKmerStructCompressedStruct = new StructType(); ReflexivKmerStructCompressedStruct = ReflexivKmerStructCompressedStruct.add("k-1", DataTypes.createArrayType(DataTypes.LongType), false); ReflexivKmerStructCompressedStruct = ReflexivKmerStructCompressedStruct.add("reflection", DataTypes.LongType, false); @@ -197,17 +179,12 @@ public void assemblyFromKmer() { ReflexivFullKmerStringStruct = ReflexivFullKmerStringStruct.add("reflection", DataTypes.StringType, false); ExpressionEncoder ReflexivFullKmerStringEncoder = RowEncoder.apply(ReflexivFullKmerStringStruct); - - - - /** * loading Kmer counts */ KmerCountDS = spark.read().csv(param.inputKmerPath1); LongerKmerCountDS = spark.read().csv(param.inputKmerPath2); - if (param.partitions > 0) { KmerCountDS = KmerCountDS.repartition(param.partitions); } @@ -248,25 +225,12 @@ public void assemblyFromKmer() { MixedFullKmerDS = MixedFullKmerDS.sort("k"); -/* - LongerKmerEnlightening LongerKmerEnlightment = new LongerKmerEnlightening(); - MixedFullKmerDS = MixedFullKmerDS.mapPartitions(LongerKmerEnlightment,ReflexivFullKmerEncoder); - MixedFullKmerDS = MixedFullKmerDS.sort("k"); -*/ ShorterKmerNeutralization SKNeutralizer = new ShorterKmerNeutralization(); MixedFullKmerDS = MixedFullKmerDS.mapPartitions(SKNeutralizer, ReflexivFullKmerEncoder); MixedFullKmerDS.persist(StorageLevel.DISK_ONLY()); - // if(param.partitions>10) { - // MixedFullKmerDS = MixedFullKmerDS.coalesce(param.partitions - 1); - // MixedFullKmerDS = MixedFullKmerDS.mapPartitions(SKNeutralizer, ReflexivFullKmerEncoder); - // } - - // MixedFullKmerDS.persist(StorageLevel.DISK_ONLY()); - // MixedFullKmerDS.show(); - /** * */ @@ -290,63 +254,42 @@ public void assemblyFromKmer() { save(param.outputPath + "/Count_" + param.kmerSize1 + "_reduced"); } - // if (param.kmerSize2<100) { // longer than 100 will not be enlightened by shorter k-mer - if (param.kmerSize2 == param.kmerListInt[param.kmerListInt.length - 1]) { - if (param.gzip) { - DSFullKmerStringLong.write(). - mode(SaveMode.Overwrite). - format("csv"). - option("codec", "org.apache.hadoop.io.compress.GzipCodec"). - save(param.outputPath + "/Count_" + param.kmerSize2 + "_reduced"); - } else { - DSFullKmerStringLong.write(). - mode(SaveMode.Overwrite). - format("csv"). - save(param.outputPath + "/Count_" + param.kmerSize2 + "_reduced"); - } + if (param.kmerSize2 == param.kmerListInt[param.kmerListInt.length - 1]) { + if (param.gzip) { + DSFullKmerStringLong.write(). + mode(SaveMode.Overwrite). + format("csv"). + option("codec", "org.apache.hadoop.io.compress.GzipCodec"). + save(param.outputPath + "/Count_" + param.kmerSize2 + "_reduced"); } else { - if (param.gzip) { - DSFullKmerStringLong.write(). - mode(SaveMode.Overwrite). - format("csv"). - option("codec", "org.apache.hadoop.io.compress.GzipCodec"). - save(param.outputPath + "/Count_" + param.kmerSize2 + "_sorted"); - } else { - DSFullKmerStringLong.write(). - mode(SaveMode.Overwrite). - format("csv"). - save(param.outputPath + "/Count_" + param.kmerSize2 + "_sorted"); - } + DSFullKmerStringLong.write(). + mode(SaveMode.Overwrite). + format("csv"). + save(param.outputPath + "/Count_" + param.kmerSize2 + "_reduced"); + } + } else { + if (param.gzip) { + DSFullKmerStringLong.write(). + mode(SaveMode.Overwrite). + format("csv"). + option("codec", "org.apache.hadoop.io.compress.GzipCodec"). + save(param.outputPath + "/Count_" + param.kmerSize2 + "_sorted"); + } else { + DSFullKmerStringLong.write(). + mode(SaveMode.Overwrite). + format("csv"). + save(param.outputPath + "/Count_" + param.kmerSize2 + "_sorted"); } - // } - - spark.stop(); - } - - - - - - class TagContigID implements FlatMapFunction, Long>, String>, Serializable { - - public Iterator call(Tuple2, Long> s) { - - - List contigList = new ArrayList(); + } - contigList.add(s._1._1 + "-" + s._2 + "\n" + s._1._2); - return contigList.iterator(); - } + spark.stop(); } - class DSBinaryFullKmerArrayToStringShort implements MapPartitionsFunction, Serializable { List reflexivKmerStringList = new ArrayList(); public Iterator call(Iterator sIterator) { - // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); - // System.out.println(timestamp + "RepeatCheck DSBinaryFullKmerArrayToStringShort: " + param.kmerSize1); while (sIterator.hasNext()) { Row s = sIterator.next(); @@ -366,7 +309,6 @@ public Iterator call(Iterator sIterator) { ) ); - // System.out.println("final final leftMarker: " + getLeftMarker(s.getLong(1))); } // else not return } @@ -460,8 +402,6 @@ class DSBinaryFullKmerArrayToStringLong implements MapPartitionsFunction reflexivKmerStringList = new ArrayList(); public Iterator call(Iterator sIterator) { - // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); - // System.out.println(timestamp + "RepeatCheck DSBinaryFullKmerArrayToStringLong: " + param.kmerSize1); while (sIterator.hasNext()) { Row s = sIterator.next(); @@ -566,90 +506,177 @@ private int getRightMarker(long attribute){ } - class LongerKmerToEnglightenKmer implements MapPartitionsFunction, Serializable{ - List LongToShortReflexedSubKmer = new ArrayList(); + class RightLongerToShorterComparisonAndNeutralizationPreparation implements MapPartitionsFunction, Serializable{ + List kmerList= new ArrayList(); + + public Iterator call(Iterator s) throws Exception { + long[] seqBlocks; + int subKmerLength; + long[] subKmer; + long[] extension=new long[1]; + long attribute; + long[] combinedBlock; + + while (s.hasNext()){ + Row fullKmer=s.next(); - // xxxxxxxxxxxxxxx x --> ----xxxxxxxxxxxx xxxx - // ----xxxxxxxxxxx x --> ----xxxxxxxxxxx x + seqBlocks=seq2array(fullKmer.getSeq(0)); + subKmerLength=currentKmerSizeFromBinaryBlockArray(seqBlocks); - public Iterator call(Iterator s) throws Exception{ - while (s.hasNext()) { - Row subKmer = s.next(); - - long[] SubKmerArray = seq2array(subKmer.getSeq(0)); - long[] ReflexivKmerArray = new long[1]; - ReflexivKmerArray[0]=subKmer.getLong(2); - long[] fullKmer; - long[] newFullKmer; - long[] newSubKmer; - long[] newReflexivKmer; - int leftShiftLength = param.kmerSize2-param.kmerSize1; - if (getReflexivMarker(subKmer.getLong(1))==1){ - fullKmer = combineTwoLongBlocks(SubKmerArray, ReflexivKmerArray); + subKmer=reverseBinaryBlocks(seqBlocks); + extension[0]=fullKmer.getLong(2); + + if (getReflexivMarker(fullKmer.getLong(1))==1){ + combinedBlock=combineTwoLongBlocks(subKmer, extension); }else{ - fullKmer = combineTwoLongBlocks(ReflexivKmerArray, SubKmerArray); + combinedBlock=combineTwoLongBlocks(extension, subKmer); } - newSubKmer= leftShiftArray(fullKmer, leftShiftLength); // llllllrrrrrrrrrrrrr - > rrrrrrrrrrrrrr - newReflexivKmer = leftShiftOutFromArray(fullKmer, leftShiftLength); // lllllllrrrrrrrrrrr-> lllllll - newFullKmer = combineTwoLongBlocks(newSubKmer, newReflexivKmer); // -> rrrrrrrrrrrrr_lllllll + subKmer = leftShiftArray(combinedBlock, 1); + extension = leftShiftOutFromArray(combinedBlock, 1); - long attribute= onlyChangeReflexivMarker(subKmer.getLong(1), 1); // all enlighten k-mer are concatenated full reflexed k-mers, mark 1 just for now - LongToShortReflexedSubKmer.add(RowFactory.create(newFullKmer,attribute)); + attribute=onlyChangeReflexivMarker(fullKmer.getLong(1), 2); - // String beforeFullKmer = BinaryBlocksToString(fullKmer); - // String newFullKmerString= BinaryBlocksToString(newFullKmer); - //System.out.println(param.kmerSize2 + " before: " + beforeFullKmer + " after: " + newFullKmerString); + kmerList.add(RowFactory.create(subKmer, attribute, extension[0])); } - return LongToShortReflexedSubKmer.iterator(); + return kmerList.iterator(); } - private long[] seq2array(Seq a){ - long[] array =new long[a.length()]; - for (int i = 0; i < a.length(); i++) { - array[i] = (Long) a.apply(i); + private long[] reverseBinaryBlocks(long[] blocks){ + int length = currentKmerSizeFromBinaryBlockArray(blocks); + int blockNumber= blocks.length; + long[] newBlocks= new long[blockNumber]; + int reverseIndex; + int reverseBlockIndex; + int relativeReverseIndex; + + int forwardBlockIndex; + + long twoBits; + for (int i=0; i>>2*(31-relativeReverseIndex); + twoBits&=3L; + + newBlocks[forwardBlockIndex]|=twoBits; + newBlocks[forwardBlockIndex] <<=2; } - return array; + int lastBlockShift=31-(length-1)%31-1; + newBlocks[newBlocks.length-1] <<=2*lastBlockShift; + newBlocks[newBlocks.length - 1] |= (1L << 2 * (lastBlockShift)); + + return newBlocks; } - private int getReflexivMarker(long attribute){ - int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker - return reflexivMarker; + private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ + long info = (long) ReflexivMarker <<2*(32-1); //move to the left most + + /** + * shorten the int and change negative to positive to avoid two's complementary + */ + if (leftCover>=30000){ + leftCover=30000; + }else if (leftCover<=-30000){ + leftCover=30000-(-30000); + }else if (leftCover<0){ + leftCover=30000-leftCover; + } + + if (rightCover>=30000){ + rightCover=30000; + }else if (rightCover<=-30000){ + rightCover=30000-(-30000); + }else if (rightCover<0){ + rightCover=30000-rightCover; + } + + info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left + info |= ((long) rightCover); // 01--LeftCover---RightCover + + return info; } - private int getLeftMarker(long attribute){ - int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker - int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 + private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { + int startingBlockIndex = (shiftingLength)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length + int remainLength=nucleotideLength-shiftingLength-1; + if (remainLength <0){ + remainLength=0; + } + long[] newBlock = new long[remainLength/31+1]; + int relativeShiftSize = shiftingLength % 31; - leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker + if (shiftingLength >= nucleotideLength){ + // apparantly, it is possible. meaning the block has nothing left + // throw new Exception("shifting length longer than the kmer length"); + newBlock[0]|=(1L<<2*31); //add c marker at the end + return newBlock; + } + // if (relativeShiftSize ==0) then only shifting blocks + int j=0; // new index for shifted blocks + // long oldShiftOut=0L; // if only one block, then 0 bits +// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex +// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); + // } + for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted + newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- + newBlock[j] |= shiftOut; + newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary - if (leftMarker>30000){ - leftMarker=30000-leftMarker; + j++; } - return leftMarker; + if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block + newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; + }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end + newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 + } // else the last block has been completely shift into the new last block, including the C marker + + return newBlock; + } - private int getRightMarker(long attribute){ - int rightMarker = (int) attribute; + private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ + int relativeShiftSize = shiftingLength % 31; + int endingBlockIndex = (shiftingLength-1)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + long[] shiftOutBlocks = new long[endingBlockIndex+1]; - if (rightMarker>30000){ - rightMarker=30000-rightMarker; + if (shiftingLength > nucleotideLength){ + // throw new Exception("shifting length longer than the kmer length"); + return blocks; } - return rightMarker; - } + for (int i=0; i 0) { + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 + shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); + }else{ // relativeShiftSize == 0; + if (endingBlockIndex+1 == blocks.length) { // a block with C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + }else{ // endingBlockIndex < blocks.length -1 means a block without C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC + } + + } + + return shiftOutBlocks; } private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throws Exception { @@ -679,8 +706,8 @@ private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throw newBlocks[j]=rightBlocks[j-leftBlocks.length]; } }else{ - // String rightBlocksString = BinaryBlocksToString(rightBlocks); - // String leftBlocksString = BinaryBlocksToString(leftBlocks); + // String rightBlocksString = BinaryBlocksToString(rightBlocks); + // String leftBlocksString = BinaryBlocksToString(leftBlocks); long[] shiftOutBlocks = leftShiftOutFromArray(rightBlocks, leftVacancy); // right shift out for the left. here we only expect one block, because leftVacancy is relative to one block for (int i =0; ibLength){ // equal should not happen + long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); + // String longer = BinaryBlocksToString(shorterVersion); + // String shorter = BinaryBlocksToString(arrayB); + // System.out.println("longer: " + longer + " shorter: " + shorter); + // if (shorterVersion.length>=2 && arrayB.length >=2) { + // System.out.println("longer array: " + shorterVersion[0] + " " + shorterVersion[1] + " shorter array: " + arrayB[0] + " " + arrayB[1]); + //} + if (Arrays.equals(shorterVersion, arrayB)){ + // if (shorterVersion.length>=2){ + // System.out.println("marker!!!"); + // } + return true; + }else{ + return false; + } + }else{ + long[] shorterVersion = leftShiftOutFromArray(arrayB, aLength); + if (Arrays.equals(shorterVersion, arrayA)){ + return true; + }else{ + return false; + } + } + } + + private long[] seq2array(Seq a){ + long[] array =new long[a.length()]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + private String BinaryBlocksToString (long[] binaryBlocks){ String KmerString=""; int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); @@ -739,210 +807,115 @@ private String BinaryBlocksToString (long[] binaryBlocks){ return KmerString; } - private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { - int startingBlockIndex = (shiftingLength)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length - - int remainLength=nucleotideLength-shiftingLength-1; - if (remainLength <0){ - remainLength=0; - } - long[] newBlock = new long[remainLength/31+1]; - int relativeShiftSize = shiftingLength % 31; - - if (shiftingLength >= nucleotideLength){ - // apparantly, it is possible. meaning the block has nothing left - // throw new Exception("shifting length longer than the kmer length"); - newBlock[0]|=(1L<<2*31); //add c marker at the end - return newBlock; - } - - // if (relativeShiftSize ==0) then only shifting blocks - - int j=0; // new index for shifted blocks - // long oldShiftOut=0L; // if only one block, then 0 bits -// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex -// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); - // } - for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted - newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- - newBlock[j] |= shiftOut; - newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary - - j++; + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0) { + nucleotide = 'A'; + } else if (twoBits == 1) { + nucleotide = 'C'; + } else if (twoBits == 2) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; } + return nucleotide; - if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block - newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; - }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end - newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 - } // else the last block has been completely shift into the new last block, including the C marker - - return newBlock; + } + private int getReflexivMarker(long attribute){ + int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker + return reflexivMarker; } - private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ - int relativeShiftSize = shiftingLength % 31; - int endingBlockIndex = (shiftingLength-1)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - long[] shiftOutBlocks = new long[endingBlockIndex+1]; + private int getLeftMarker(long attribute){ + int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker + int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 + leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker - if (shiftingLength > nucleotideLength){ - // throw new Exception("shifting length longer than the kmer length"); - return blocks; + if (leftMarker>30000){ + leftMarker=30000-leftMarker; } - for (int i=0; i 0) { - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 - shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); - }else{ // relativeShiftSize == 0; - if (endingBlockIndex+1 == blocks.length) { // a block with C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - }else{ // endingBlockIndex < blocks.length -1 means a block without C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC - } + private int getRightMarker(long attribute){ + int rightMarker = (int) attribute; + if (rightMarker>30000){ + rightMarker=30000-rightMarker; } - return shiftOutBlocks; + return rightMarker; } - private char BinaryToNucleotide(Long twoBits) { - char nucleotide; - if (twoBits == 0L) { - nucleotide = 'A'; - } else if (twoBits == 1L) { - nucleotide = 'C'; - } else if (twoBits == 2L) { - nucleotide = 'G'; - } else { - nucleotide = 'T'; - } - return nucleotide; + private long onlyChangeReflexivMarker(long oldMarker, int reflexivMarker){ + long maxSubKmerBinary = ~((~0L) << 2 * 31); + long newMarker = oldMarker & maxSubKmerBinary; + newMarker |= ((long) reflexivMarker) << 2*(32-1); + return newMarker; } } - class LongerKmerEnlightening implements MapPartitionsFunction, Serializable{ - List newFullKmerList = new ArrayList(); - Row shorterFullKmer; - List tempLongerFullKmer = new ArrayList(); + class LeftLongerToShorterComparisonPreparation implements MapPartitionsFunction, Serializable{ + List kmerList = new ArrayList(); - public Iterator call(Iterator s) throws Exception{ - while(s.hasNext()) { - Row FullKmer = s.next(); - // System.out.println(param.kmerSize2+"mark"); - long[] FullKmerArray = seq2array(FullKmer.getSeq(0)); - int FullKmerLength = currentKmerSizeFromBinaryBlockArray(FullKmerArray); + public Iterator call(Iterator s) throws Exception { + long[] seqBlocks; + int kmerLength; + long[] subKmer; + long[] extension; + long attribute; - if (FullKmerLength == param.kmerSize1) { // shorter FullKmer size = param.kmerSize1 -1 - if (shorterFullKmer != null) { // already one exists - if (tempLongerFullKmer.size() > 0) { - if (getRightMarker(shorterFullKmer.getLong(1)) > 0) { - for (int i = 0; i < tempLongerFullKmer.size(); i++) { - if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), tempLongerFullKmer.get(i).getSeq(0)) == true) { - newFullKmerList.add(enlightening(tempLongerFullKmer.get(i), true)); - } else { - newFullKmerList.add(enlightening(tempLongerFullKmer.get(i), false)); - } - } - } else { // adding temp to output without changing - for (int i = 0; i < tempLongerFullKmer.size(); i++) { - newFullKmerList.add(enlightening(tempLongerFullKmer.get(i), false)); - } - } - tempLongerFullKmer = new ArrayList(); - } - } + // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); + // System.out.println(timestamp+ "RepeatCheck LeftLongerToShorterComparisonPreparation: " + param.kmerSize1); - shorterFullKmer = FullKmer; - newFullKmerList.add(FullKmer); - } else { // it is a longer K-mer - if (shorterFullKmer == null) { - tempLongerFullKmer.add(FullKmer); - } else { - if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), FullKmer.getSeq(0)) == true) { - if (getRightMarker(shorterFullKmer.getLong(1)) > 0) { - newFullKmerList.add(enlightening(FullKmer, true)); - } else { - newFullKmerList.add(enlightening(FullKmer, false)); - } - } else { // longer Kmer not overlap to shorter k-mer anymore, a new round starts - if (getRightMarker(shorterFullKmer.getLong(1)) > 0) { - for (int i = 0; i < tempLongerFullKmer.size(); i++) { - if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), tempLongerFullKmer.get(i).getSeq(0)) == true) { - newFullKmerList.add(enlightening(tempLongerFullKmer.get(i), true)); - } else { - newFullKmerList.add(enlightening(tempLongerFullKmer.get(i), false)); - } - } - } else { // adding temp to output without changing - for (int i = 0; i < tempLongerFullKmer.size(); i++) { - newFullKmerList.add(enlightening(tempLongerFullKmer.get(i), false)); - } - } + while (s.hasNext()) { + Row fullKmer = s.next(); + seqBlocks= seq2array(fullKmer.getSeq(0)); + kmerLength=currentKmerSizeFromBinaryBlockArray(seqBlocks); - tempLongerFullKmer=new ArrayList(); - shorterFullKmer = null; - tempLongerFullKmer.add(FullKmer); - } - } - } - } + subKmer = leftShiftOutFromArray(seqBlocks, kmerLength-1); + extension=leftShiftArray(seqBlocks, kmerLength-1); + subKmer = reverseBinaryBlocks(subKmer); - //release the last temp units - if (shorterFullKmer==null){ - for (int i=0;i0){ - for (int i=0;i>>2*(31-relativeReverseIndex); + twoBits&=3L; + + newBlocks[forwardBlockIndex]|=twoBits; + newBlocks[forwardBlockIndex] <<=2; } - return fullKmer; + int lastBlockShift=31-(length-1)%31-1; + newBlocks[newBlocks.length-1] <<=2*lastBlockShift; + newBlocks[newBlocks.length - 1] |= (1L << 2 * (lastBlockShift)); + + return newBlocks; } private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ @@ -1076,8 +1049,8 @@ private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throw newBlocks[j]=rightBlocks[j-leftBlocks.length]; } }else{ - // String rightBlocksString = BinaryBlocksToString(rightBlocks); - // String leftBlocksString = BinaryBlocksToString(leftBlocks); + // String rightBlocksString = BinaryBlocksToString(rightBlocks); + // String leftBlocksString = BinaryBlocksToString(leftBlocks); long[] shiftOutBlocks = leftShiftOutFromArray(rightBlocks, leftVacancy); // right shift out for the left. here we only expect one block, because leftVacancy is relative to one block for (int i =0; ibLength){ // equal should not happen long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); - // String longer = BinaryBlocksToString(shorterVersion); - // String shorter = BinaryBlocksToString(arrayB); + // String longer = BinaryBlocksToString(shorterVersion); + // String shorter = BinaryBlocksToString(arrayB); // System.out.println("longer: " + longer + " shorter: " + shorter); // if (shorterVersion.length>=2 && arrayB.length >=2) { // System.out.println("longer array: " + shorterVersion[0] + " " + shorterVersion[1] + " shorter array: " + arrayB[0] + " " + arrayB[1]); @@ -1218,2281 +1191,582 @@ private int getRightMarker(long attribute){ return rightMarker; } + + private long onlyChangeReflexivMarker(long oldMarker, int reflexivMarker){ + long maxSubKmerBinary = ~((~0L) << 2 * 31); + long newMarker = oldMarker & maxSubKmerBinary; + newMarker |= ((long) reflexivMarker) << 2*(32-1); + return newMarker; + } } - class RightLongerToShorterComparisonAndNeutralizationPreparation implements MapPartitionsFunction, Serializable{ + class RightLongerKmerVariantAdjustmentAndNeutralization implements MapPartitionsFunction, Serializable{ List kmerList= new ArrayList(); + Row secondLastKmer; + Row lastKmer; public Iterator call(Iterator s) throws Exception { - long[] seqBlocks; - int subKmerLength; - long[] subKmer; - long[] extension=new long[1]; - long attribute; - long[] combinedBlock; - - // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); - // System.out.println(timestamp+ "RepeatCheck RightLongerToShorterComparisonAndNeutralizationPreparation: " + param.kmerSize1); - - while (s.hasNext()){ - Row fullKmer=s.next(); + // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); + // System.out.println(timestamp + "RepeatCheck RightLongerKmerVariantAdjustmentAndNeutralization: " + param.kmerSize1); - seqBlocks=seq2array(fullKmer.getSeq(0)); - subKmerLength=currentKmerSizeFromBinaryBlockArray(seqBlocks); + while (s.hasNext()) { + Row fullKmer = s.next(); - subKmer=reverseBinaryBlocks(seqBlocks); - extension[0]=fullKmer.getLong(2); + if (secondLastKmer == null) { + secondLastKmer=fullKmer; + } else if (lastKmer == null){ + lastKmer=fullKmer; + } else { + int currentLength= currentKmerSizeFromBinaryBlockArray(seq2array(fullKmer.getSeq(0))); + int lastLength = currentKmerSizeFromBinaryBlockArray(seq2array(lastKmer.getSeq(0))); + int secondLastLength= currentKmerSizeFromBinaryBlockArray(seq2array(secondLastKmer.getSeq(0))); - if (getReflexivMarker(fullKmer.getLong(1))==1){ - combinedBlock=combineTwoLongBlocks(subKmer, extension); - }else{ - combinedBlock=combineTwoLongBlocks(extension, subKmer); - } - - subKmer = leftShiftArray(combinedBlock, 1); - extension = leftShiftOutFromArray(combinedBlock, 1); - - attribute=onlyChangeReflexivMarker(fullKmer.getLong(1), 2); - - kmerList.add(RowFactory.create(subKmer, attribute, extension[0])); - - } - - return kmerList.iterator(); - } - - private long[] reverseBinaryBlocks(long[] blocks){ - int length = currentKmerSizeFromBinaryBlockArray(blocks); - int blockNumber= blocks.length; - long[] newBlocks= new long[blockNumber]; - int reverseIndex; - int reverseBlockIndex; - int relativeReverseIndex; - - int forwardBlockIndex; - - long twoBits; - for (int i=0; i>>2*(31-relativeReverseIndex); - twoBits&=3L; - - newBlocks[forwardBlockIndex]|=twoBits; - newBlocks[forwardBlockIndex] <<=2; - } - int lastBlockShift=31-(length-1)%31-1; - newBlocks[newBlocks.length-1] <<=2*lastBlockShift; - newBlocks[newBlocks.length - 1] |= (1L << 2 * (lastBlockShift)); - - return newBlocks; - } - - private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ - long info = (long) ReflexivMarker <<2*(32-1); //move to the left most - - /** - * shorten the int and change negative to positive to avoid two's complementary - */ - if (leftCover>=30000){ - leftCover=30000; - }else if (leftCover<=-30000){ - leftCover=30000-(-30000); - }else if (leftCover<0){ - leftCover=30000-leftCover; - } - - if (rightCover>=30000){ - rightCover=30000; - }else if (rightCover<=-30000){ - rightCover=30000-(-30000); - }else if (rightCover<0){ - rightCover=30000-rightCover; - } - - info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left - info |= ((long) rightCover); // 01--LeftCover---RightCover - - return info; - } - - private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { - int startingBlockIndex = (shiftingLength)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length - - int remainLength=nucleotideLength-shiftingLength-1; - if (remainLength <0){ - remainLength=0; - } - long[] newBlock = new long[remainLength/31+1]; - int relativeShiftSize = shiftingLength % 31; - - if (shiftingLength >= nucleotideLength){ - // apparantly, it is possible. meaning the block has nothing left - // throw new Exception("shifting length longer than the kmer length"); - newBlock[0]|=(1L<<2*31); //add c marker at the end - return newBlock; - } - - // if (relativeShiftSize ==0) then only shifting blocks - - int j=0; // new index for shifted blocks - // long oldShiftOut=0L; // if only one block, then 0 bits -// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex -// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); - // } - for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted - newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- - newBlock[j] |= shiftOut; - newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary - - j++; - } - - if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block - newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; - }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end - newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 - } // else the last block has been completely shift into the new last block, including the C marker - - return newBlock; - - } - - private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ - int relativeShiftSize = shiftingLength % 31; - int endingBlockIndex = (shiftingLength-1)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - long[] shiftOutBlocks = new long[endingBlockIndex+1]; - - if (shiftingLength > nucleotideLength){ - // throw new Exception("shifting length longer than the kmer length"); - return blocks; - } - - for (int i=0; i 0) { - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 - shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); - }else{ // relativeShiftSize == 0; - if (endingBlockIndex+1 == blocks.length) { // a block with C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - }else{ // endingBlockIndex < blocks.length -1 means a block without C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC - } - - } - - return shiftOutBlocks; - } - - private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throws Exception { - int leftNucleotideLength = currentKmerSizeFromBinaryBlockArray(leftBlocks); - int leftRelativeNTLength = (leftNucleotideLength-1) % 31+1; - int leftVacancy = 31-leftRelativeNTLength; - int rightNucleotideLength = currentKmerSizeFromBinaryBlockArray(rightBlocks); - int combinedBlockSize = (leftNucleotideLength+rightNucleotideLength-1)/31+1; - long[] newBlocks= new long[combinedBlockSize]; - - if (rightNucleotideLength==0){ - return leftBlocks; - } - - if (leftNucleotideLength==0){ - return rightBlocks; - } - - if (leftVacancy ==0){ // left last block is a perfect block - for (int i =0; i>> 2*(leftRelativeNTLength)); - if (leftBlocks.lengthbLength){ // equal should not happen - long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); - // String longer = BinaryBlocksToString(shorterVersion); - // String shorter = BinaryBlocksToString(arrayB); - // System.out.println("longer: " + longer + " shorter: " + shorter); - // if (shorterVersion.length>=2 && arrayB.length >=2) { - // System.out.println("longer array: " + shorterVersion[0] + " " + shorterVersion[1] + " shorter array: " + arrayB[0] + " " + arrayB[1]); - //} - if (Arrays.equals(shorterVersion, arrayB)){ - // if (shorterVersion.length>=2){ - // System.out.println("marker!!!"); - // } - return true; - }else{ - return false; - } - }else{ - long[] shorterVersion = leftShiftOutFromArray(arrayB, aLength); - if (Arrays.equals(shorterVersion, arrayA)){ - return true; - }else{ - return false; - } - } - } - - private long[] seq2array(Seq a){ - long[] array =new long[a.length()]; - for (int i = 0; i < a.length(); i++) { - array[i] = (Long) a.apply(i); - } - return array; - } - - private String BinaryBlocksToString (long[] binaryBlocks){ - String KmerString=""; - int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); - - for (int i=0; i< KmerLength; i++){ - Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); - currentNucleotideBinary &= 3L; - char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); - KmerString += currentNucleotide; - } - - return KmerString; - } - - private char BinaryToNucleotide(Long twoBits) { - char nucleotide; - if (twoBits == 0) { - nucleotide = 'A'; - } else if (twoBits == 1) { - nucleotide = 'C'; - } else if (twoBits == 2) { - nucleotide = 'G'; - } else { - nucleotide = 'T'; - } - return nucleotide; - - } - - private int getReflexivMarker(long attribute){ - int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker - return reflexivMarker; - } - - private int getLeftMarker(long attribute){ - int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker - int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 - leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker - - if (leftMarker>30000){ - leftMarker=30000-leftMarker; - } - - return leftMarker; - } - - private int getRightMarker(long attribute){ - int rightMarker = (int) attribute; - - if (rightMarker>30000){ - rightMarker=30000-rightMarker; - } - - return rightMarker; - } - - private long onlyChangeReflexivMarker(long oldMarker, int reflexivMarker){ - long maxSubKmerBinary = ~((~0L) << 2 * 31); - long newMarker = oldMarker & maxSubKmerBinary; - newMarker |= ((long) reflexivMarker) << 2*(32-1); - return newMarker; - } - } - - class LeftLongerToShorterComparisonPreparation implements MapPartitionsFunction, Serializable{ - List kmerList = new ArrayList(); - - public Iterator call(Iterator s) throws Exception { - long[] seqBlocks; - int kmerLength; - long[] subKmer; - long[] extension; - long attribute; - - // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); - // System.out.println(timestamp+ "RepeatCheck LeftLongerToShorterComparisonPreparation: " + param.kmerSize1); - - while (s.hasNext()) { - Row fullKmer = s.next(); - seqBlocks= seq2array(fullKmer.getSeq(0)); - kmerLength=currentKmerSizeFromBinaryBlockArray(seqBlocks); - - subKmer = leftShiftOutFromArray(seqBlocks, kmerLength-1); - extension=leftShiftArray(seqBlocks, kmerLength-1); - subKmer = reverseBinaryBlocks(subKmer); - - attribute=onlyChangeReflexivMarker(fullKmer.getLong(1), 1); - - kmerList.add(RowFactory.create(subKmer, attribute, extension[0])); - } - - return kmerList.iterator(); - } - - private long[] reverseBinaryBlocks(long[] blocks){ - int length = currentKmerSizeFromBinaryBlockArray(blocks); - int blockNumber= blocks.length; - long[] newBlocks= new long[blockNumber]; - int reverseIndex; - int reverseBlockIndex; - int relativeReverseIndex; - - int forwardBlockIndex; - - long twoBits; - for (int i=0; i>>2*(31-relativeReverseIndex); - twoBits&=3L; - - newBlocks[forwardBlockIndex]|=twoBits; - newBlocks[forwardBlockIndex] <<=2; - } - int lastBlockShift=31-(length-1)%31-1; - newBlocks[newBlocks.length-1] <<=2*lastBlockShift; - newBlocks[newBlocks.length - 1] |= (1L << 2 * (lastBlockShift)); - - return newBlocks; - } - - private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ - long info = (long) ReflexivMarker <<2*(32-1); //move to the left most - - /** - * shorten the int and change negative to positive to avoid two's complementary - */ - if (leftCover>=30000){ - leftCover=30000; - }else if (leftCover<=-30000){ - leftCover=30000-(-30000); - }else if (leftCover<0){ - leftCover=30000-leftCover; - } - - if (rightCover>=30000){ - rightCover=30000; - }else if (rightCover<=-30000){ - rightCover=30000-(-30000); - }else if (rightCover<0){ - rightCover=30000-rightCover; - } - - info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left - info |= ((long) rightCover); // 01--LeftCover---RightCover - - return info; - } - - private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { - int startingBlockIndex = (shiftingLength)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length - - int remainLength=nucleotideLength-shiftingLength-1; - if (remainLength <0){ - remainLength=0; - } - long[] newBlock = new long[remainLength/31+1]; - int relativeShiftSize = shiftingLength % 31; - - if (shiftingLength >= nucleotideLength){ - // apparantly, it is possible. meaning the block has nothing left - // throw new Exception("shifting length longer than the kmer length"); - newBlock[0]|=(1L<<2*31); //add c marker at the end - return newBlock; - } - - // if (relativeShiftSize ==0) then only shifting blocks - - int j=0; // new index for shifted blocks - // long oldShiftOut=0L; // if only one block, then 0 bits -// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex -// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); - // } - for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted - newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- - newBlock[j] |= shiftOut; - newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary - - j++; - } - - if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block - newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; - }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end - newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 - } // else the last block has been completely shift into the new last block, including the C marker - - return newBlock; - - } - - private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ - int relativeShiftSize = shiftingLength % 31; - int endingBlockIndex = (shiftingLength-1)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - long[] shiftOutBlocks = new long[endingBlockIndex+1]; - - if (shiftingLength > nucleotideLength){ - // throw new Exception("shifting length longer than the kmer length"); - return blocks; - } - - for (int i=0; i 0) { - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 - shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); - }else{ // relativeShiftSize == 0; - if (endingBlockIndex+1 == blocks.length) { // a block with C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - }else{ // endingBlockIndex < blocks.length -1 means a block without C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC - } - - } - - return shiftOutBlocks; - } - - private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throws Exception { - int leftNucleotideLength = currentKmerSizeFromBinaryBlockArray(leftBlocks); - int leftRelativeNTLength = (leftNucleotideLength-1) % 31+1; - int leftVacancy = 31-leftRelativeNTLength; - int rightNucleotideLength = currentKmerSizeFromBinaryBlockArray(rightBlocks); - int combinedBlockSize = (leftNucleotideLength+rightNucleotideLength-1)/31+1; - long[] newBlocks= new long[combinedBlockSize]; - - if (rightNucleotideLength==0){ - return leftBlocks; - } - - if (leftNucleotideLength==0){ - return rightBlocks; - } - - if (leftVacancy ==0){ // left last block is a perfect block - for (int i =0; i>> 2*(leftRelativeNTLength)); - if (leftBlocks.lengthbLength){ // equal should not happen - long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); - // String longer = BinaryBlocksToString(shorterVersion); - // String shorter = BinaryBlocksToString(arrayB); - // System.out.println("longer: " + longer + " shorter: " + shorter); - // if (shorterVersion.length>=2 && arrayB.length >=2) { - // System.out.println("longer array: " + shorterVersion[0] + " " + shorterVersion[1] + " shorter array: " + arrayB[0] + " " + arrayB[1]); - //} - if (Arrays.equals(shorterVersion, arrayB)){ - // if (shorterVersion.length>=2){ - // System.out.println("marker!!!"); - // } - return true; - }else{ - return false; - } - }else{ - long[] shorterVersion = leftShiftOutFromArray(arrayB, aLength); - if (Arrays.equals(shorterVersion, arrayA)){ - return true; - }else{ - return false; - } - } - } - - private long[] seq2array(Seq a){ - long[] array =new long[a.length()]; - for (int i = 0; i < a.length(); i++) { - array[i] = (Long) a.apply(i); - } - return array; - } - - private String BinaryBlocksToString (long[] binaryBlocks){ - String KmerString=""; - int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); - - for (int i=0; i< KmerLength; i++){ - Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); - currentNucleotideBinary &= 3L; - char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); - KmerString += currentNucleotide; - } - - return KmerString; - } - - private char BinaryToNucleotide(Long twoBits) { - char nucleotide; - if (twoBits == 0) { - nucleotide = 'A'; - } else if (twoBits == 1) { - nucleotide = 'C'; - } else if (twoBits == 2) { - nucleotide = 'G'; - } else { - nucleotide = 'T'; - } - return nucleotide; - - } - - private int getReflexivMarker(long attribute){ - int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker - return reflexivMarker; - } - - private int getLeftMarker(long attribute){ - int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker - int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 - leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker - - if (leftMarker>30000){ - leftMarker=30000-leftMarker; - } - - return leftMarker; - } - - private int getRightMarker(long attribute){ - int rightMarker = (int) attribute; - - if (rightMarker>30000){ - rightMarker=30000-rightMarker; - } - - return rightMarker; - } - - private long onlyChangeReflexivMarker(long oldMarker, int reflexivMarker){ - long maxSubKmerBinary = ~((~0L) << 2 * 31); - long newMarker = oldMarker & maxSubKmerBinary; - newMarker |= ((long) reflexivMarker) << 2*(32-1); - return newMarker; - } - } - - class RightLongerKmerVariantAdjustmentAndNeutralization implements MapPartitionsFunction, Serializable{ - List kmerList= new ArrayList(); - Row secondLastKmer; - Row lastKmer; - - public Iterator call(Iterator s) throws Exception { - // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); - // System.out.println(timestamp + "RepeatCheck RightLongerKmerVariantAdjustmentAndNeutralization: " + param.kmerSize1); - - while (s.hasNext()) { - Row fullKmer = s.next(); - - if (secondLastKmer == null) { - secondLastKmer=fullKmer; - } else if (lastKmer == null){ - lastKmer=fullKmer; - } else { - int currentLength= currentKmerSizeFromBinaryBlockArray(seq2array(fullKmer.getSeq(0))); - int lastLength = currentKmerSizeFromBinaryBlockArray(seq2array(lastKmer.getSeq(0))); - int secondLastLength= currentKmerSizeFromBinaryBlockArray(seq2array(secondLastKmer.getSeq(0))); - - // ----- - // ? - // ? - if (secondLastLength==param.kmerSize1-1){ //shorter - // ----- - // ----- - // ? - if (lastLength==param.kmerSize1-1){ // another shorter - // kmerList.add(secondLastKmer); - // kmerList.add(lastKmer); - - // ----- - // ----- - // ----- - if (currentLength==param.kmerSize1-1) { - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - secondLastKmer=fullKmer; - lastKmer=null; - } - // ----- - // ----- - // -------- - else{ - - if (dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0))){ - /* - long attribute=fullKmer.getLong(1); - if (getLeftMarker(lastKmer.getLong(1))<0 && getLeftMarker(fullKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(fullKmer.getLong(1)), -1, getRightMarker(fullKmer.getLong(1))); - } - - if (lastKmer.getLong(2) == fullKmer.getLong(2)){ - fullKmer=RowFactory.create(fullKmer.getSeq(0),attribute, fullKmer.getLong(2)); - }else{ - fullKmer=RowFactory.create(fullKmer.getSeq(0),attribute, lastKmer.getLong(2)); - } -*/ - kmerList.add(secondLastKmer); - - secondLastKmer=lastKmer; - lastKmer=fullKmer; - }else { - - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - } - } - } - // ----- - // -------- - // ? - else{ // longer - // ----- - // -------- - // ----- - if (currentLength==param.kmerSize1-1){ - if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0))){ - long attribute=lastKmer.getLong(1); - if (getLeftMarker(secondLastKmer.getLong(1))<0 && getLeftMarker(lastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), -1, getRightMarker(lastKmer.getLong(1))); - } - - if (secondLastKmer.getLong(2)== lastKmer.getLong(2)){ - lastKmer = RowFactory.create(lastKmer.getSeq(0), attribute, lastKmer.getLong(2)); - }else{ - lastKmer= RowFactory.create(lastKmer.getSeq(0), attribute, secondLastKmer.getLong(2)); - } - - // kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer= fullKmer; - lastKmer=null; - }else if(dynamicSubKmerComparator(fullKmer.getSeq(0), lastKmer.getSeq(0))){ // need to be decided together with the next in coming k-mer - kmerList.add(secondLastKmer); - - secondLastKmer= lastKmer; - lastKmer=fullKmer; - }else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - - } - } - // ----- - // -------- - // -------- - else{ - if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0), secondLastKmer.getSeq(0))){ - /* - long attribute=lastKmer.getLong(1); - if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(lastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), getLeftMarker(lastKmer.getLong(1)), -1); - } - - lastKmer=RowFactory.create(lastKmer.getSeq(0), attribute, lastKmer.getLong(2)); - - if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(fullKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(fullKmer.getLong(1)), getLeftMarker(fullKmer.getLong(1)), -1); - } - - fullKmer = RowFactory.create(fullKmer.getSeq(0),attribute, fullKmer.getLong(2)); -*/ - if (secondLastKmer.getLong(2) == lastKmer.getLong(2) || secondLastKmer.getLong(2) ==fullKmer.getLong(2)){ - kmerList.add(lastKmer); - kmerList.add(fullKmer); - }else { - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - kmerList.add(fullKmer); - } - - secondLastKmer=null; - lastKmer=null; - }else if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0))){ - long attribute=lastKmer.getLong(1); - if (getLeftMarker(secondLastKmer.getLong(1))<0 && getLeftMarker(lastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), -1, getRightMarker(lastKmer.getLong(1))); - } - - if (lastKmer.getLong(2) == secondLastKmer.getLong(2)){ - lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, lastKmer.getLong(2)); - }else{ - lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); - } - - // kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - secondLastKmer=fullKmer; - lastKmer=null; - }else{ // the two long kmer will be decided together with the next incoming k-mer - kmerList.add(secondLastKmer); - - secondLastKmer = lastKmer; - lastKmer= fullKmer; - } - } - } - } - // -------- - // ? - // ? - else{ - // -------- - // ----- - // ? - if (lastLength==param.kmerSize1-1){ - // -------- - // ----- - // ----- - if (currentLength==param.kmerSize1-1){ - if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ - long attribute=secondLastKmer.getLong(1); - if (getLeftMarker(lastKmer.getLong(1))<0 && getLeftMarker(secondLastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), -1, getRightMarker(secondLastKmer.getLong(1))); - } - - if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); - }else{ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); - } - - kmerList.add(secondLastKmer); - // kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - }else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - } - } - // -------- - // ----- - // -------- - else{ - if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0)) ){ - if (secondLastKmer.getLong(2) == lastKmer.getLong(2) || lastKmer.getLong(2) ==fullKmer.getLong(2)){ - kmerList.add(secondLastKmer); - kmerList.add(fullKmer); - }else { - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - kmerList.add(fullKmer); - } - - secondLastKmer=null; - lastKmer=null; - }else if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ - long attribute=secondLastKmer.getLong(1); - if (getLeftMarker(lastKmer.getLong(1))<0 && getLeftMarker(secondLastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), -1, getRightMarker(secondLastKmer.getLong(1))); - } - - if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); - }else{ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); - } - - kmerList.add(secondLastKmer); - // kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - }else if (dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0))){ // the last two need to be decided together with the incoming k-mer - - kmerList.add(secondLastKmer); - secondLastKmer= lastKmer; - lastKmer=fullKmer; - - }else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - } - } - } - // -------- - // -------- - // ? - else{ - // -------- - // -------- - // ----- - if (currentLength==param.kmerSize1-1){ - if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),fullKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0)) ){ - if (secondLastKmer.getLong(2) == fullKmer.getLong(2) || lastKmer.getLong(2) ==fullKmer.getLong(2)){ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - }else { - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - kmerList.add(fullKmer); - } - - secondLastKmer=null; - lastKmer=null; - }else if (dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0))){ // the last two need to be decided together with the incoming kmer - kmerList.add(secondLastKmer); - secondLastKmer=lastKmer; - lastKmer=fullKmer; - }else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - } - } - // -------- - // -------- - // -------- - else{ // the last two need to be decided together with the incoming k-mer - kmerList.add(secondLastKmer); - - secondLastKmer=lastKmer; - lastKmer = fullKmer; - } - } - } - } - } - - if (secondLastKmer!=null){ - if (lastKmer!=null){ - int secondLastLength = currentKmerSizeFromBinaryBlockArray(seq2array(secondLastKmer.getSeq(0))); - int lastLength = currentKmerSizeFromBinaryBlockArray(seq2array(lastKmer.getSeq(0))); - // ----- - // ? - if (secondLastLength==param.kmerSize1-1){ - // ----- - // ----- - if (lastLength==param.kmerSize1-1){ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - } - // ----- - // -------- - else{ - if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ - long attribute=lastKmer.getLong(1); - if (getLeftMarker(secondLastKmer.getLong(1))<0 && getLeftMarker(lastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), -1, getRightMarker(lastKmer.getLong(1))); - } - - if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ - lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, lastKmer.getLong(2)); - }else{ - lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); - } - - // kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - }else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - } - } - } - // -------- - // ? - else{ - // -------- - // ----- - if (lastLength==param.kmerSize1-1){ - if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ - long attribute=secondLastKmer.getLong(1); - if (getLeftMarker(lastKmer.getLong(1))<0 && getLeftMarker(secondLastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), -1, getRightMarker(secondLastKmer.getLong(1))); - } - - if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); - }else{ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); - } - - kmerList.add(secondLastKmer); - // kmerList.add(lastKmer); - } - } - // -------- - // -------- - else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - } - } - }else{ - kmerList.add(secondLastKmer); - } - }else if (lastKmer !=null){ - kmerList.add(lastKmer); - } - - return kmerList.iterator(); - } - - private long[] reverseBinaryBlocks(long[] blocks){ - int length = currentKmerSizeFromBinaryBlockArray(blocks); - int blockNumber= blocks.length; - long[] newBlocks= new long[blockNumber]; - int reverseIndex; - int reverseBlockIndex; - int relativeReverseIndex; - - int forwardBlockIndex; - - long twoBits; - for (int i=0; i>>2*(31-relativeReverseIndex); - twoBits&=3L; - - newBlocks[forwardBlockIndex]|=twoBits; - newBlocks[forwardBlockIndex] <<=2; - } - int lastBlockShift=31-(length-1)%31-1; - newBlocks[newBlocks.length-1] <<=2*lastBlockShift; - newBlocks[newBlocks.length - 1] |= (1L << 2 * (lastBlockShift)); - - return newBlocks; - } - - private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ - long info = (long) ReflexivMarker <<2*(32-1); //move to the left most - - /** - * shorten the int and change negative to positive to avoid two's complementary - */ - if (leftCover>=30000){ - leftCover=30000; - }else if (leftCover<=-30000){ - leftCover=30000-(-30000); - }else if (leftCover<0){ - leftCover=30000-leftCover; - } - - if (rightCover>=30000){ - rightCover=30000; - }else if (rightCover<=-30000){ - rightCover=30000-(-30000); - }else if (rightCover<0){ - rightCover=30000-rightCover; - } - - info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left - info |= ((long) rightCover); // 01--LeftCover---RightCover - - return info; - } - - private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { - int startingBlockIndex = (shiftingLength)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length - - int remainLength=nucleotideLength-shiftingLength-1; - if (remainLength <0){ - remainLength=0; - } - long[] newBlock = new long[remainLength/31+1]; - int relativeShiftSize = shiftingLength % 31; - - if (shiftingLength >= nucleotideLength){ - // apparantly, it is possible. meaning the block has nothing left - // throw new Exception("shifting length longer than the kmer length"); - newBlock[0]|=(1L<<2*31); //add c marker at the end - return newBlock; - } - - // if (relativeShiftSize ==0) then only shifting blocks - - int j=0; // new index for shifted blocks - // long oldShiftOut=0L; // if only one block, then 0 bits -// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex -// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); - // } - for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted - newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- - newBlock[j] |= shiftOut; - newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary - - j++; - } - - if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block - newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; - }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end - newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 - } // else the last block has been completely shift into the new last block, including the C marker - - return newBlock; - - } - - private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ - int relativeShiftSize = shiftingLength % 31; - int endingBlockIndex = (shiftingLength-1)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - long[] shiftOutBlocks = new long[endingBlockIndex+1]; - - if (shiftingLength > nucleotideLength){ - // throw new Exception("shifting length longer than the kmer length"); - return blocks; - } - - for (int i=0; i 0) { - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 - shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); - }else{ // relativeShiftSize == 0; - if (endingBlockIndex+1 == blocks.length) { // a block with C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - }else{ // endingBlockIndex < blocks.length -1 means a block without C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC - } - - } - - return shiftOutBlocks; - } - - private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throws Exception { - int leftNucleotideLength = currentKmerSizeFromBinaryBlockArray(leftBlocks); - int leftRelativeNTLength = (leftNucleotideLength-1) % 31+1; - int leftVacancy = 31-leftRelativeNTLength; - int rightNucleotideLength = currentKmerSizeFromBinaryBlockArray(rightBlocks); - int combinedBlockSize = (leftNucleotideLength+rightNucleotideLength-1)/31+1; - long[] newBlocks= new long[combinedBlockSize]; - - if (rightNucleotideLength==0){ - return leftBlocks; - } - - if (leftNucleotideLength==0){ - return rightBlocks; - } - - if (leftVacancy ==0){ // left last block is a perfect block - for (int i =0; i>> 2*(leftRelativeNTLength)); - if (leftBlocks.lengthbLength){ // equal should not happen - long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); - // String longer = BinaryBlocksToString(shorterVersion); - // String shorter = BinaryBlocksToString(arrayB); - // System.out.println("longer: " + longer + " shorter: " + shorter); - // if (shorterVersion.length>=2 && arrayB.length >=2) { - // System.out.println("longer array: " + shorterVersion[0] + " " + shorterVersion[1] + " shorter array: " + arrayB[0] + " " + arrayB[1]); - //} - if (Arrays.equals(shorterVersion, arrayB)){ - // if (shorterVersion.length>=2){ - // System.out.println("marker!!!"); - // } - return true; - }else{ - return false; - } - }else{ - long[] shorterVersion = leftShiftOutFromArray(arrayB, aLength); - if (Arrays.equals(shorterVersion, arrayA)){ - return true; - }else{ - return false; - } - } - } - - private long[] seq2array(Seq a){ - long[] array =new long[a.length()]; - for (int i = 0; i < a.length(); i++) { - array[i] = (Long) a.apply(i); - } - return array; - } - - private String BinaryBlocksToString (long[] binaryBlocks){ - String KmerString=""; - int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); - - for (int i=0; i< KmerLength; i++){ - Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); - currentNucleotideBinary &= 3L; - char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); - KmerString += currentNucleotide; - } - - return KmerString; - } - - private char BinaryToNucleotide(Long twoBits) { - char nucleotide; - if (twoBits == 0) { - nucleotide = 'A'; - } else if (twoBits == 1) { - nucleotide = 'C'; - } else if (twoBits == 2) { - nucleotide = 'G'; - } else { - nucleotide = 'T'; - } - return nucleotide; - - } - - private int getReflexivMarker(long attribute){ - int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker - return reflexivMarker; - } - - private int getLeftMarker(long attribute){ - int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker - int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 - leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker - - if (leftMarker>30000){ - leftMarker=30000-leftMarker; - } - - return leftMarker; - } - - private int getRightMarker(long attribute){ - int rightMarker = (int) attribute; - - if (rightMarker>30000){ - rightMarker=30000-rightMarker; - } - - return rightMarker; - } - - private long onlyChangeReflexivMarker(long oldMarker, int reflexivMarker){ - long maxSubKmerBinary = ~((~0L) << 2 * 31); - long newMarker = oldMarker & maxSubKmerBinary; - newMarker |= ((long) reflexivMarker) << 2*(32-1); - return newMarker; - } - } - - class LeftLongerKmerVariantAdjustment implements MapPartitionsFunction, Serializable{ - List kmerList = new ArrayList(); - Row secondLastKmer; - Row lastKmer; - - public Iterator call(Iterator s) throws Exception { - // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); - // System.out.println(timestamp+ "RepeatCheck LeftLongerKmerVariantAdjustment: " + param.kmerSize1); - - while (s.hasNext()) { - Row fullKmer = s.next(); - - if (secondLastKmer == null) { - secondLastKmer=fullKmer; - } else if (lastKmer == null){ - lastKmer=fullKmer; - } else { - int currentLength= currentKmerSizeFromBinaryBlockArray(seq2array(fullKmer.getSeq(0))); - int lastLength = currentKmerSizeFromBinaryBlockArray(seq2array(lastKmer.getSeq(0))); - int secondLastLength= currentKmerSizeFromBinaryBlockArray(seq2array(secondLastKmer.getSeq(0))); - - // ----- - // ? - // ? - if (secondLastLength==param.kmerSize1-1){ //shorter - // ----- - // ----- - // ? - if (lastLength==param.kmerSize1-1){ // another shorter - // kmerList.add(secondLastKmer); - // kmerList.add(lastKmer); - - // ----- - // ----- - // ----- - if (currentLength==param.kmerSize1-1) { - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - secondLastKmer=fullKmer; - lastKmer=null; - } - // ----- - // ----- - // -------- - else{ - - if (dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0))){ - /* - long attribute=fullKmer.getLong(1); - if (getRightMarker(lastKmer.getLong(1))<0 && getRightMarker(fullKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(fullKmer.getLong(1)), getLeftMarker(fullKmer.getLong(1)), -1); - } - - if (lastKmer.getLong(2) == fullKmer.getLong(2)){ - fullKmer=RowFactory.create(fullKmer.getSeq(0),attribute, fullKmer.getLong(2)); - }else{ - fullKmer=RowFactory.create(fullKmer.getSeq(0),attribute, lastKmer.getLong(2)); - } -*/ - kmerList.add(secondLastKmer); - - secondLastKmer=lastKmer; - lastKmer=fullKmer; - }else { - - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - } - } - } - // ----- - // -------- - // ? - else{ // longer - // ----- - // -------- - // ----- - if (currentLength==param.kmerSize1-1){ - if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0))){ - long attribute=lastKmer.getLong(1); - if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(lastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), getLeftMarker(lastKmer.getLong(1)), -1); - } - - if (secondLastKmer.getLong(2)== lastKmer.getLong(2)){ - lastKmer = RowFactory.create(lastKmer.getSeq(0), attribute, lastKmer.getLong(2)); - }else{ - lastKmer= RowFactory.create(lastKmer.getSeq(0), attribute, secondLastKmer.getLong(2)); - } - - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer= fullKmer; - lastKmer=null; - }else if(dynamicSubKmerComparator(fullKmer.getSeq(0), lastKmer.getSeq(0))){ // need to be decided together with the next in coming k-mer - kmerList.add(secondLastKmer); - - secondLastKmer= lastKmer; - lastKmer=fullKmer; - }else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - - } - } - // ----- - // -------- - // -------- - else{ - if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0), secondLastKmer.getSeq(0))){ - /* - long attribute=lastKmer.getLong(1); - if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(lastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), getLeftMarker(lastKmer.getLong(1)), -1); - } - - lastKmer=RowFactory.create(lastKmer.getSeq(0), attribute, lastKmer.getLong(2)); - - if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(fullKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(fullKmer.getLong(1)), getLeftMarker(fullKmer.getLong(1)), -1); - } - - fullKmer = RowFactory.create(fullKmer.getSeq(0),attribute, fullKmer.getLong(2)); -*/ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - kmerList.add(fullKmer); - - secondLastKmer=null; - lastKmer=null; - }else if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0))){ - long attribute=lastKmer.getLong(1); - if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(lastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), getLeftMarker(lastKmer.getLong(1)), -1); - } - - if (lastKmer.getLong(2) == secondLastKmer.getLong(2)){ - lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, lastKmer.getLong(2)); - }else{ - lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); - } - - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - secondLastKmer=fullKmer; - lastKmer=null; - }else{ // the two long kmer will be decided together with the next incoming k-mer - kmerList.add(secondLastKmer); - - secondLastKmer = lastKmer; - lastKmer= fullKmer; - } - } - } - } - // -------- + // ----- // ? // ? - else{ - // -------- + if (secondLastLength==param.kmerSize1-1){ //shorter + // ----- // ----- // ? - if (lastLength==param.kmerSize1-1){ - // -------- - // ----- - // ----- - if (currentLength==param.kmerSize1-1){ - if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ - long attribute=secondLastKmer.getLong(1); - if (getRightMarker(lastKmer.getLong(1))<0 && getRightMarker(secondLastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), getLeftMarker(secondLastKmer.getLong(1)), -1); - } - - if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); - }else{ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); - } - - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - }else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); + if (lastLength==param.kmerSize1-1){ // another shorter + // kmerList.add(secondLastKmer); + // kmerList.add(lastKmer); - secondLastKmer=fullKmer; - lastKmer=null; - } - } - // -------- // ----- - // -------- - else{ - if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0)) ){ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - kmerList.add(fullKmer); - - secondLastKmer=null; - lastKmer=null; - }else if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ - long attribute=secondLastKmer.getLong(1); - if (getRightMarker(lastKmer.getLong(1))<0 && getRightMarker(secondLastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), getLeftMarker(secondLastKmer.getLong(1)), -1); - } - - if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); - }else{ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); - } - - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - }else if (dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0))){ // the last two need to be decided together with the incoming k-mer - - kmerList.add(secondLastKmer); - secondLastKmer= lastKmer; - lastKmer=fullKmer; - - }else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - } - } - } - // -------- - // -------- - // ? - else{ - // -------- - // -------- // ----- - if (currentLength==param.kmerSize1-1){ - if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),fullKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0)) ){ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - kmerList.add(fullKmer); - - secondLastKmer=null; - lastKmer=null; - }else if (dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0))){ // the last two need to be decided together with the incoming kmer - kmerList.add(secondLastKmer); - secondLastKmer=lastKmer; - lastKmer=fullKmer; - }else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - - secondLastKmer=fullKmer; - lastKmer=null; - } - } - // -------- - // -------- - // -------- - else{ // the last two need to be decided together with the incoming k-mer - kmerList.add(secondLastKmer); - - secondLastKmer=lastKmer; - lastKmer = fullKmer; - } - } - } - } - } - - if (secondLastKmer!=null){ - if (lastKmer!=null){ - int secondLastLength = currentKmerSizeFromBinaryBlockArray(seq2array(secondLastKmer.getSeq(0))); - int lastLength = currentKmerSizeFromBinaryBlockArray(seq2array(lastKmer.getSeq(0))); - // ----- - // ? - if (secondLastLength==param.kmerSize1-1){ - // ----- - // ----- - if (lastLength==param.kmerSize1-1){ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - } - // ----- - // -------- - else{ - if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ - long attribute=lastKmer.getLong(1); - if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(lastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), getLeftMarker(lastKmer.getLong(1)), -1); - } - - if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ - lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, lastKmer.getLong(2)); - }else{ - lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); - } - - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - }else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - } - } - } - // -------- - // ? - else{ - // -------- - // ----- - if (lastLength==param.kmerSize1-1){ - if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ - long attribute=secondLastKmer.getLong(1); - if (getRightMarker(lastKmer.getLong(1))<0 && getRightMarker(secondLastKmer.getLong(1))>=0){ - attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), getLeftMarker(secondLastKmer.getLong(1)), -1); - } - - if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); - }else{ - secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); - } - + // ----- + if (currentLength==param.kmerSize1-1) { kmerList.add(secondLastKmer); kmerList.add(lastKmer); + secondLastKmer=fullKmer; + lastKmer=null; } - } - // -------- - // -------- - else{ - kmerList.add(secondLastKmer); - kmerList.add(lastKmer); - } - } - }else{ - kmerList.add(secondLastKmer); - } - }else if (lastKmer !=null){ - kmerList.add(lastKmer); - } - - return kmerList.iterator(); - } - - private long[] reverseBinaryBlocks(long[] blocks){ - int length = currentKmerSizeFromBinaryBlockArray(blocks); - int blockNumber= blocks.length; - long[] newBlocks= new long[blockNumber]; - int reverseIndex; - int reverseBlockIndex; - int relativeReverseIndex; - - int forwardBlockIndex; - - long twoBits; - for (int i=0; i>>2*(31-relativeReverseIndex); - twoBits&=3L; - - newBlocks[forwardBlockIndex]|=twoBits; - newBlocks[forwardBlockIndex] <<=2; - } - int lastBlockShift=31-(length-1)%31-1; - newBlocks[newBlocks.length-1] <<=2*lastBlockShift; - newBlocks[newBlocks.length - 1] |= (1L << 2 * (lastBlockShift)); - - return newBlocks; - } + // ----- + // ----- + // -------- + else{ - private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ - long info = (long) ReflexivMarker <<2*(32-1); //move to the left most + if (dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0))){ + /* + long attribute=fullKmer.getLong(1); + if (getLeftMarker(lastKmer.getLong(1))<0 && getLeftMarker(fullKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(fullKmer.getLong(1)), -1, getRightMarker(fullKmer.getLong(1))); + } - /** - * shorten the int and change negative to positive to avoid two's complementary - */ - if (leftCover>=30000){ - leftCover=30000; - }else if (leftCover<=-30000){ - leftCover=30000-(-30000); - }else if (leftCover<0){ - leftCover=30000-leftCover; - } + if (lastKmer.getLong(2) == fullKmer.getLong(2)){ + fullKmer=RowFactory.create(fullKmer.getSeq(0),attribute, fullKmer.getLong(2)); + }else{ + fullKmer=RowFactory.create(fullKmer.getSeq(0),attribute, lastKmer.getLong(2)); + } +*/ + kmerList.add(secondLastKmer); - if (rightCover>=30000){ - rightCover=30000; - }else if (rightCover<=-30000){ - rightCover=30000-(-30000); - }else if (rightCover<0){ - rightCover=30000-rightCover; - } + secondLastKmer=lastKmer; + lastKmer=fullKmer; + }else { - info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left - info |= ((long) rightCover); // 01--LeftCover---RightCover + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); - return info; - } + secondLastKmer=fullKmer; + lastKmer=null; + } + } + } + // ----- + // -------- + // ? + else{ // longer + // ----- + // -------- + // ----- + if (currentLength==param.kmerSize1-1){ + if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0))){ + long attribute=lastKmer.getLong(1); + if (getLeftMarker(secondLastKmer.getLong(1))<0 && getLeftMarker(lastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), -1, getRightMarker(lastKmer.getLong(1))); + } - private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { - int startingBlockIndex = (shiftingLength)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length + if (secondLastKmer.getLong(2)== lastKmer.getLong(2)){ + lastKmer = RowFactory.create(lastKmer.getSeq(0), attribute, lastKmer.getLong(2)); + }else{ + lastKmer= RowFactory.create(lastKmer.getSeq(0), attribute, secondLastKmer.getLong(2)); + } - int remainLength=nucleotideLength-shiftingLength-1; - if (remainLength <0){ - remainLength=0; - } - long[] newBlock = new long[remainLength/31+1]; - int relativeShiftSize = shiftingLength % 31; + // kmerList.add(secondLastKmer); + kmerList.add(lastKmer); - if (shiftingLength >= nucleotideLength){ - // apparantly, it is possible. meaning the block has nothing left - // throw new Exception("shifting length longer than the kmer length"); - newBlock[0]|=(1L<<2*31); //add c marker at the end - return newBlock; - } + secondLastKmer= fullKmer; + lastKmer=null; + }else if(dynamicSubKmerComparator(fullKmer.getSeq(0), lastKmer.getSeq(0))){ // need to be decided together with the next in coming k-mer + kmerList.add(secondLastKmer); - // if (relativeShiftSize ==0) then only shifting blocks + secondLastKmer= lastKmer; + lastKmer=fullKmer; + }else{ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); - int j=0; // new index for shifted blocks - // long oldShiftOut=0L; // if only one block, then 0 bits -// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex -// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); - // } - for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted - newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- - newBlock[j] |= shiftOut; - newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary + secondLastKmer=fullKmer; + lastKmer=null; - j++; - } + } + } + // ----- + // -------- + // -------- + else{ + if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0), secondLastKmer.getSeq(0))){ + /* + long attribute=lastKmer.getLong(1); + if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(lastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), getLeftMarker(lastKmer.getLong(1)), -1); + } - if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block - newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; - }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end - newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 - } // else the last block has been completely shift into the new last block, including the C marker + lastKmer=RowFactory.create(lastKmer.getSeq(0), attribute, lastKmer.getLong(2)); - return newBlock; + if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(fullKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(fullKmer.getLong(1)), getLeftMarker(fullKmer.getLong(1)), -1); + } - } + fullKmer = RowFactory.create(fullKmer.getSeq(0),attribute, fullKmer.getLong(2)); +*/ + if (secondLastKmer.getLong(2) == lastKmer.getLong(2) || secondLastKmer.getLong(2) ==fullKmer.getLong(2)){ + kmerList.add(lastKmer); + kmerList.add(fullKmer); + }else { + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + kmerList.add(fullKmer); + } - private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ - int relativeShiftSize = shiftingLength % 31; - int endingBlockIndex = (shiftingLength-1)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - long[] shiftOutBlocks = new long[endingBlockIndex+1]; + secondLastKmer=null; + lastKmer=null; + }else if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0))){ + long attribute=lastKmer.getLong(1); + if (getLeftMarker(secondLastKmer.getLong(1))<0 && getLeftMarker(lastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), -1, getRightMarker(lastKmer.getLong(1))); + } - if (shiftingLength > nucleotideLength){ - // throw new Exception("shifting length longer than the kmer length"); - return blocks; - } + if (lastKmer.getLong(2) == secondLastKmer.getLong(2)){ + lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, lastKmer.getLong(2)); + }else{ + lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); + } - for (int i=0; i 0) { - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 - shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); - }else{ // relativeShiftSize == 0; - if (endingBlockIndex+1 == blocks.length) { // a block with C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - }else{ // endingBlockIndex < blocks.length -1 means a block without C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC - } + secondLastKmer = lastKmer; + lastKmer= fullKmer; + } + } + } + } + // -------- + // ? + // ? + else{ + // -------- + // ----- + // ? + if (lastLength==param.kmerSize1-1){ + // -------- + // ----- + // ----- + if (currentLength==param.kmerSize1-1){ + if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ + long attribute=secondLastKmer.getLong(1); + if (getLeftMarker(lastKmer.getLong(1))<0 && getLeftMarker(secondLastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), -1, getRightMarker(secondLastKmer.getLong(1))); + } - } + if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ + secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); + }else{ + secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); + } - return shiftOutBlocks; - } + kmerList.add(secondLastKmer); + // kmerList.add(lastKmer); - private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throws Exception { - int leftNucleotideLength = currentKmerSizeFromBinaryBlockArray(leftBlocks); - int leftRelativeNTLength = (leftNucleotideLength-1) % 31+1; - int leftVacancy = 31-leftRelativeNTLength; - int rightNucleotideLength = currentKmerSizeFromBinaryBlockArray(rightBlocks); - int combinedBlockSize = (leftNucleotideLength+rightNucleotideLength-1)/31+1; - long[] newBlocks= new long[combinedBlockSize]; + secondLastKmer=fullKmer; + lastKmer=null; + }else{ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); - if (rightNucleotideLength==0){ - return leftBlocks; - } + secondLastKmer=fullKmer; + lastKmer=null; + } + } + // -------- + // ----- + // -------- + else{ + if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0)) ){ + if (secondLastKmer.getLong(2) == lastKmer.getLong(2) || lastKmer.getLong(2) ==fullKmer.getLong(2)){ + kmerList.add(secondLastKmer); + kmerList.add(fullKmer); + }else { + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + kmerList.add(fullKmer); + } - if (leftNucleotideLength==0){ - return rightBlocks; - } + secondLastKmer=null; + lastKmer=null; + }else if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ + long attribute=secondLastKmer.getLong(1); + if (getLeftMarker(lastKmer.getLong(1))<0 && getLeftMarker(secondLastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), -1, getRightMarker(secondLastKmer.getLong(1))); + } - if (leftVacancy ==0){ // left last block is a perfect block - for (int i =0; i>> 2*(leftRelativeNTLength)); - if (leftBlocks.length=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), -1, getRightMarker(lastKmer.getLong(1))); + } - } + if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ + lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, lastKmer.getLong(2)); + }else{ + lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); + } - private boolean dynamicSubKmerComparator(Seq a, Seq b) throws Exception { - long[] arrayA = seq2array(a); - long[] arrayB = seq2array(b); + // kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + }else{ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + } + } + } + // -------- + // ? + else{ + // -------- + // ----- + if (lastLength==param.kmerSize1-1){ + if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ + long attribute=secondLastKmer.getLong(1); + if (getLeftMarker(lastKmer.getLong(1))<0 && getLeftMarker(secondLastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), -1, getRightMarker(secondLastKmer.getLong(1))); + } - int aLength= currentKmerSizeFromBinaryBlockArray(arrayA); - int bLength= currentKmerSizeFromBinaryBlockArray(arrayB); + if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ + secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); + }else{ + secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); + } - if (aLength>bLength){ // equal should not happen - long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); - // String longer = BinaryBlocksToString(shorterVersion); - // String shorter = BinaryBlocksToString(arrayB); - // System.out.println("longer: " + longer + " shorter: " + shorter); - // if (shorterVersion.length>=2 && arrayB.length >=2) { - // System.out.println("longer array: " + shorterVersion[0] + " " + shorterVersion[1] + " shorter array: " + arrayB[0] + " " + arrayB[1]); - //} - if (Arrays.equals(shorterVersion, arrayB)){ - // if (shorterVersion.length>=2){ - // System.out.println("marker!!!"); - // } - return true; - }else{ - return false; - } - }else{ - long[] shorterVersion = leftShiftOutFromArray(arrayB, aLength); - if (Arrays.equals(shorterVersion, arrayA)){ - return true; + kmerList.add(secondLastKmer); + // kmerList.add(lastKmer); + } + } + // -------- + // -------- + else{ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + } + } }else{ - return false; + kmerList.add(secondLastKmer); } - } - } - - private long[] seq2array(Seq a){ - long[] array =new long[a.length()]; - for (int i = 0; i < a.length(); i++) { - array[i] = (Long) a.apply(i); - } - return array; - } - - private String BinaryBlocksToString (long[] binaryBlocks){ - String KmerString=""; - int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); - - for (int i=0; i< KmerLength; i++){ - Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); - currentNucleotideBinary &= 3L; - char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); - KmerString += currentNucleotide; + }else if (lastKmer !=null){ + kmerList.add(lastKmer); } - return KmerString; + return kmerList.iterator(); } - private char BinaryToNucleotide(Long twoBits) { - char nucleotide; - if (twoBits == 0) { - nucleotide = 'A'; - } else if (twoBits == 1) { - nucleotide = 'C'; - } else if (twoBits == 2) { - nucleotide = 'G'; - } else { - nucleotide = 'T'; - } - return nucleotide; + private long[] reverseBinaryBlocks(long[] blocks){ + int length = currentKmerSizeFromBinaryBlockArray(blocks); + int blockNumber= blocks.length; + long[] newBlocks= new long[blockNumber]; + int reverseIndex; + int reverseBlockIndex; + int relativeReverseIndex; - } + int forwardBlockIndex; - private int getReflexivMarker(long attribute){ - int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker - return reflexivMarker; - } + long twoBits; + for (int i=0; i>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker - int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 - leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker + forwardBlockIndex=i/31; - if (leftMarker>30000){ - leftMarker=30000-leftMarker; + twoBits=blocks[reverseBlockIndex] >>>2*(31-relativeReverseIndex); + twoBits&=3L; + + newBlocks[forwardBlockIndex]|=twoBits; + newBlocks[forwardBlockIndex] <<=2; } + int lastBlockShift=31-(length-1)%31-1; + newBlocks[newBlocks.length-1] <<=2*lastBlockShift; + newBlocks[newBlocks.length - 1] |= (1L << 2 * (lastBlockShift)); - return leftMarker; + return newBlocks; } - private int getRightMarker(long attribute){ - int rightMarker = (int) attribute; + private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ + long info = (long) ReflexivMarker <<2*(32-1); //move to the left most - if (rightMarker>30000){ - rightMarker=30000-rightMarker; + /** + * shorten the int and change negative to positive to avoid two's complementary + */ + if (leftCover>=30000){ + leftCover=30000; + }else if (leftCover<=-30000){ + leftCover=30000-(-30000); + }else if (leftCover<0){ + leftCover=30000-leftCover; } - return rightMarker; - } + if (rightCover>=30000){ + rightCover=30000; + }else if (rightCover<=-30000){ + rightCover=30000-(-30000); + }else if (rightCover<0){ + rightCover=30000-rightCover; + } - private long onlyChangeReflexivMarker(long oldMarker, int reflexivMarker){ - long maxSubKmerBinary = ~((~0L) << 2 * 31); - long newMarker = oldMarker & maxSubKmerBinary; - newMarker |= ((long) reflexivMarker) << 2*(32-1); - return newMarker; + info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left + info |= ((long) rightCover); // 01--LeftCover---RightCover + + return info; } - } - /** - * - */ - class ShorterKmerNeutralization implements MapPartitionsFunction, Serializable{ - List LongerFullKmer = new ArrayList(); - List newFullKmerList = new ArrayList(); - Row shorterFullKmer; - List tempLongerFullKmer = new ArrayList(); - boolean neutralizeMarker = false; + private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { + int startingBlockIndex = (shiftingLength)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length - public Iterator call(Iterator s) throws Exception { - // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); - // System.out.println(timestamp + "RepeatCheck ShorterKmerNeutralization: " + param.kmerSize1); + int remainLength=nucleotideLength-shiftingLength-1; + if (remainLength <0){ + remainLength=0; + } + long[] newBlock = new long[remainLength/31+1]; + int relativeShiftSize = shiftingLength % 31; + if (shiftingLength >= nucleotideLength){ + // apparantly, it is possible. meaning the block has nothing left + // throw new Exception("shifting length longer than the kmer length"); + newBlock[0]|=(1L<<2*31); //add c marker at the end + return newBlock; + } - while (s.hasNext()) { - Row fullKmer = s.next(); + // if (relativeShiftSize ==0) then only shifting blocks - // String fullKmerString = BinaryBlocksToString(seq2array(FullKmer.getSeq(0))); - // System.out.println("neutralization kmer: " + fullKmerString); -/* - long[] FullKmerArray = seq2array(FullKmer.getSeq(0)); - int FullKmerLength = currentKmerSizeFromBinaryBlockArray(FullKmerArray); - if (FullKmerLength == param.kmerSize1) { // shorter FullKmer size = param.kmerSize1 -1 - if (shorterFullKmer != null) { // already one exists - if (tempLongerFullKmer.size() > 0) { - if (getLeftMarker(shorterFullKmer.getLong(1)) > 0) { - for (int i = 0; i < tempLongerFullKmer.size(); i++) { - if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), tempLongerFullKmer.get(i).getSeq(0)) == true) { - neutralizeMarker=true; - newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), true)); - } else { - newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), false)); - } - } - } else { // adding temp to output without changing - for (int i = 0; i < tempLongerFullKmer.size(); i++) { - if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), tempLongerFullKmer.get(i).getSeq(0)) == true) { - neutralizeMarker=true; - } - newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), false)); - } - } - tempLongerFullKmer=new ArrayList(); - } - } + int j=0; // new index for shifted blocks + // long oldShiftOut=0L; // if only one block, then 0 bits +// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex +// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); + // } + for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted + newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- + newBlock[j] |= shiftOut; + newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary - if(neutralizeMarker==true){ - neutralizeMarker=false; // reset marker - }else{ - if (shorterFullKmer!=null) { - newFullKmerList.add(shorterFullKmer); - } - } - shorterFullKmer = FullKmer; - // newFullKmerList.add(FullKmer); - } else { // it is a longer K-mer - if (shorterFullKmer == null) { - tempLongerFullKmer.add(FullKmer); - } else { - if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), FullKmer.getSeq(0)) == true) { - neutralizeMarker=true; - if (getLeftMarker(shorterFullKmer.getLong(1)) > 0) { - newFullKmerList.add(enlighteningLeft(FullKmer, true)); - } else { - newFullKmerList.add(enlighteningLeft(FullKmer, false)); - } - } else { // longer Kmer not overlap to shorter k-mer anymore, a new round starts - if (getLeftMarker(shorterFullKmer.getLong(1)) > 0) { - for (int i = 0; i < tempLongerFullKmer.size(); i++) { - if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), tempLongerFullKmer.get(i).getSeq(0)) == true) { - neutralizeMarker=true; - newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), true)); - } else { - newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), false)); - } - } - } else { // adding temp to output without changing + j++; + } - for (int i = 0; i < tempLongerFullKmer.size(); i++) { - if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), tempLongerFullKmer.get(i).getSeq(0)) == true) { - neutralizeMarker=true; - } - newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), false)); - } - } + if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block + newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; + }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end + newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 + } // else the last block has been completely shift into the new last block, including the C marker - if (neutralizeMarker==true){ - neutralizeMarker=false; - }else{ - newFullKmerList.add(shorterFullKmer); - } + return newBlock; - tempLongerFullKmer= new ArrayList(); - shorterFullKmer = null; - tempLongerFullKmer.add(FullKmer); - } - } - } -*/ + } - if (LongerFullKmer.size() == 0) { - LongerFullKmer.add( - RowFactory.create(fullKmer.getSeq(0), fullKmer.getLong(1)) - ); - } else { - int currentLength= currentKmerSizeFromBinaryBlockArray(seq2array(fullKmer.getSeq(0))); - int lastLength = currentKmerSizeFromBinaryBlockArray(seq2array(LongerFullKmer.get(LongerFullKmer.size() - 1).getSeq(0))); - if ( currentLength== lastLength ){ // two kmer with equal size - LongerFullKmer.add( - RowFactory.create(fullKmer.getSeq(0), fullKmer.getLong(1)) - ); - } else if (dynamicSubKmerComparator(fullKmer.getSeq(0), LongerFullKmer.get(LongerFullKmer.size() - 1).getSeq(0)) == true) { - long[] lastKmer = seq2array(LongerFullKmer.get(LongerFullKmer.size() - 1).getSeq(0)); - long[] currentKmer = seq2array(fullKmer.getSeq(0)); + private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ + int relativeShiftSize = shiftingLength % 31; + int endingBlockIndex = (shiftingLength-1)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + long[] shiftOutBlocks = new long[endingBlockIndex+1]; - int lastKmerLength = currentKmerSizeFromBinaryBlockArray(lastKmer); - int currentKmerLength = currentKmerSizeFromBinaryBlockArray(currentKmer); + if (shiftingLength > nucleotideLength){ + // throw new Exception("shifting length longer than the kmer length"); + return blocks; + } - if (lastKmerLength >currentKmerLength){ - continue; - }else{ - LongerFullKmer.remove(LongerFullKmer.size() - 1); - LongerFullKmer.add( - RowFactory.create(fullKmer.getSeq(0), fullKmer.getLong(1)) - ); - } - } else { - LongerFullKmer.add( - RowFactory.create(fullKmer.getSeq(0), fullKmer.getLong(1)) - ); - } + for (int i=0; i 0) { + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 + shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); + }else{ // relativeShiftSize == 0; + if (endingBlockIndex+1 == blocks.length) { // a block with C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + }else{ // endingBlockIndex < blocks.length -1 means a block without C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC } } -/* - if (shorterFullKmer==null){ - for (int i=0;i0){ - for (int i=0;i=30000){ - leftCover=30000; - }else if (leftCover<=-30000){ - leftCover=30000-(-30000); - }else if (leftCover<0){ - leftCover=30000-leftCover; - } + newBlocks[leftBlocks.length-1] &= (~0L<<2*(leftVacancy+1)); // leftVacancy = 32-leftRelativeNTLength-1. This is to remove the C marker + newBlocks[leftBlocks.length-1] |= (shiftOutBlocks[0]>>> 2*(leftRelativeNTLength)); + if (leftBlocks.length=30000){ - rightCover=30000; - }else if (rightCover<=-30000){ - rightCover=30000-(-30000); - }else if (rightCover<0){ - rightCover=30000-rightCover; - } + long[] rightBlocksLeftShifted = leftShiftArray(rightBlocks, leftVacancy); - info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left - info |= ((long) rightCover); // 01--LeftCover---RightCover + int k=0; // rightBlocksLeftShifted index + for (int j=leftBlocks.length;jbLength){ // equal should not happen long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); - // String longer = BinaryBlocksToString(shorterVersion); - // String shorter = BinaryBlocksToString(arrayB); - // System.out.println("longer: " + longer + " shorter: " + shorter); - // if (shorterVersion.length>=2 && arrayB.length >=2) { + // String longer = BinaryBlocksToString(shorterVersion); + // String shorter = BinaryBlocksToString(arrayB); + // System.out.println("longer: " + longer + " shorter: " + shorter); + // if (shorterVersion.length>=2 && arrayB.length >=2) { // System.out.println("longer array: " + shorterVersion[0] + " " + shorterVersion[1] + " shorter array: " + arrayB[0] + " " + arrayB[1]); //} if (Arrays.equals(shorterVersion, arrayB)){ - // if (shorterVersion.length>=2){ - // System.out.println("marker!!!"); - // } + // if (shorterVersion.length>=2){ + // System.out.println("marker!!!"); + // } return true; }else{ return false; @@ -3540,322 +1814,465 @@ private boolean dynamicSubKmerComparator(Seq a, Seq b) throws Exception { } } - private long[] seq2array(Seq a){ - long[] array =new long[a.length()]; - for (int i = 0; i < a.length(); i++) { - array[i] = (Long) a.apply(i); - } - return array; - } + private long[] seq2array(Seq a){ + long[] array =new long[a.length()]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + + private String BinaryBlocksToString (long[] binaryBlocks){ + String KmerString=""; + int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + KmerString += currentNucleotide; + } + + return KmerString; + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0) { + nucleotide = 'A'; + } else if (twoBits == 1) { + nucleotide = 'C'; + } else if (twoBits == 2) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; + + } + + private int getReflexivMarker(long attribute){ + int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker + return reflexivMarker; + } + + private int getLeftMarker(long attribute){ + int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker + int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 + leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker + + if (leftMarker>30000){ + leftMarker=30000-leftMarker; + } + + return leftMarker; + } + + private int getRightMarker(long attribute){ + int rightMarker = (int) attribute; + + if (rightMarker>30000){ + rightMarker=30000-rightMarker; + } + + return rightMarker; + } + + private long onlyChangeReflexivMarker(long oldMarker, int reflexivMarker){ + long maxSubKmerBinary = ~((~0L) << 2 * 31); + long newMarker = oldMarker & maxSubKmerBinary; + newMarker |= ((long) reflexivMarker) << 2*(32-1); + return newMarker; + } + } + + class LeftLongerKmerVariantAdjustment implements MapPartitionsFunction, Serializable{ + List kmerList = new ArrayList(); + Row secondLastKmer; + Row lastKmer; + + public Iterator call(Iterator s) throws Exception { + // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); + // System.out.println(timestamp+ "RepeatCheck LeftLongerKmerVariantAdjustment: " + param.kmerSize1); + + while (s.hasNext()) { + Row fullKmer = s.next(); + + if (secondLastKmer == null) { + secondLastKmer=fullKmer; + } else if (lastKmer == null){ + lastKmer=fullKmer; + } else { + int currentLength= currentKmerSizeFromBinaryBlockArray(seq2array(fullKmer.getSeq(0))); + int lastLength = currentKmerSizeFromBinaryBlockArray(seq2array(lastKmer.getSeq(0))); + int secondLastLength= currentKmerSizeFromBinaryBlockArray(seq2array(secondLastKmer.getSeq(0))); + + // ----- + // ? + // ? + if (secondLastLength==param.kmerSize1-1){ //shorter + // ----- + // ----- + // ? + if (lastLength==param.kmerSize1-1){ // another shorter + // kmerList.add(secondLastKmer); + // kmerList.add(lastKmer); + + // ----- + // ----- + // ----- + if (currentLength==param.kmerSize1-1) { + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + secondLastKmer=fullKmer; + lastKmer=null; + } + // ----- + // ----- + // -------- + else{ + + if (dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0))){ + /* + long attribute=fullKmer.getLong(1); + if (getRightMarker(lastKmer.getLong(1))<0 && getRightMarker(fullKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(fullKmer.getLong(1)), getLeftMarker(fullKmer.getLong(1)), -1); + } + + if (lastKmer.getLong(2) == fullKmer.getLong(2)){ + fullKmer=RowFactory.create(fullKmer.getSeq(0),attribute, fullKmer.getLong(2)); + }else{ + fullKmer=RowFactory.create(fullKmer.getSeq(0),attribute, lastKmer.getLong(2)); + } +*/ + kmerList.add(secondLastKmer); + + secondLastKmer=lastKmer; + lastKmer=fullKmer; + }else { + + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + + secondLastKmer=fullKmer; + lastKmer=null; + } + } + } + // ----- + // -------- + // ? + else{ // longer + // ----- + // -------- + // ----- + if (currentLength==param.kmerSize1-1){ + if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0))){ + long attribute=lastKmer.getLong(1); + if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(lastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), getLeftMarker(lastKmer.getLong(1)), -1); + } + + if (secondLastKmer.getLong(2)== lastKmer.getLong(2)){ + lastKmer = RowFactory.create(lastKmer.getSeq(0), attribute, lastKmer.getLong(2)); + }else{ + lastKmer= RowFactory.create(lastKmer.getSeq(0), attribute, secondLastKmer.getLong(2)); + } - private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ - int relativeShiftSize = shiftingLength % 31; - int endingBlockIndex = (shiftingLength-1)/31; - int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); - long[] shiftOutBlocks = new long[endingBlockIndex+1]; + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); - if (shiftingLength > nucleotideLength){ - return blocks; - // throw new Exception("shifting length longer than the kmer length"); - } + secondLastKmer= fullKmer; + lastKmer=null; + }else if(dynamicSubKmerComparator(fullKmer.getSeq(0), lastKmer.getSeq(0))){ // need to be decided together with the next in coming k-mer + kmerList.add(secondLastKmer); - for (int i=0; i 0) { - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 - shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); - }else{ // relativeShiftSize == 0; - if (endingBlockIndex+1 == blocks.length) { // a block with C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - }else{ // endingBlockIndex < blocks.length -1 means a block without C marker - shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; - shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC - } + secondLastKmer=fullKmer; + lastKmer=null; - } + } + } + // ----- + // -------- + // -------- + else{ + if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0), secondLastKmer.getSeq(0))){ + /* + long attribute=lastKmer.getLong(1); + if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(lastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), getLeftMarker(lastKmer.getLong(1)), -1); + } - return shiftOutBlocks; - } + lastKmer=RowFactory.create(lastKmer.getSeq(0), attribute, lastKmer.getLong(2)); - private String BinaryBlocksToString (long[] binaryBlocks){ - String KmerString=""; - int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(fullKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(fullKmer.getLong(1)), getLeftMarker(fullKmer.getLong(1)), -1); + } - for (int i=0; i< KmerLength; i++){ - Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); - currentNucleotideBinary &= 3L; - char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); - KmerString += currentNucleotide; - } + fullKmer = RowFactory.create(fullKmer.getSeq(0),attribute, fullKmer.getLong(2)); +*/ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + kmerList.add(fullKmer); - return KmerString; - } + secondLastKmer=null; + lastKmer=null; + }else if (dynamicSubKmerComparator(lastKmer.getSeq(0), secondLastKmer.getSeq(0))){ + long attribute=lastKmer.getLong(1); + if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(lastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), getLeftMarker(lastKmer.getLong(1)), -1); + } - private char BinaryToNucleotide(Long twoBits) { - char nucleotide; - if (twoBits == 0) { - nucleotide = 'A'; - } else if (twoBits == 1) { - nucleotide = 'C'; - } else if (twoBits == 2) { - nucleotide = 'G'; - } else { - nucleotide = 'T'; - } - return nucleotide; + if (lastKmer.getLong(2) == secondLastKmer.getLong(2)){ + lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, lastKmer.getLong(2)); + }else{ + lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); + } - } + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + secondLastKmer=fullKmer; + lastKmer=null; + }else{ // the two long kmer will be decided together with the next incoming k-mer + kmerList.add(secondLastKmer); - private int getReflexivMarker(long attribute){ - int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker - return reflexivMarker; - } + secondLastKmer = lastKmer; + lastKmer= fullKmer; + } + } + } + } + // -------- + // ? + // ? + else{ + // -------- + // ----- + // ? + if (lastLength==param.kmerSize1-1){ + // -------- + // ----- + // ----- + if (currentLength==param.kmerSize1-1){ + if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ + long attribute=secondLastKmer.getLong(1); + if (getRightMarker(lastKmer.getLong(1))<0 && getRightMarker(secondLastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), getLeftMarker(secondLastKmer.getLong(1)), -1); + } - private int getLeftMarker(long attribute){ - int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker - int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 - leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker + if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ + secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); + }else{ + secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); + } - if (leftMarker>30000){ - leftMarker=30000-leftMarker; - } + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); - return leftMarker; - } + secondLastKmer=fullKmer; + lastKmer=null; + }else{ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); - private int getRightMarker(long attribute){ - int rightMarker = (int) attribute; + secondLastKmer=fullKmer; + lastKmer=null; + } + } + // -------- + // ----- + // -------- + else{ + if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0)) ){ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + kmerList.add(fullKmer); - if (rightMarker>30000){ - rightMarker=30000-rightMarker; - } + secondLastKmer=null; + lastKmer=null; + }else if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ + long attribute=secondLastKmer.getLong(1); + if (getRightMarker(lastKmer.getLong(1))<0 && getRightMarker(secondLastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), getLeftMarker(secondLastKmer.getLong(1)), -1); + } - return rightMarker; - } + if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ + secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); + }else{ + secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); + } - } + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); - /** - * - */ + secondLastKmer=fullKmer; + lastKmer=null; + }else if (dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0))){ // the last two need to be decided together with the incoming k-mer + + kmerList.add(secondLastKmer); + secondLastKmer= lastKmer; + lastKmer=fullKmer; - class DSFilterForkSubKmer implements MapPartitionsFunction, Serializable { - List HighCoverageSubKmer = new ArrayList(); -// Tuple2> HighCoverKmer=null; -// new Tuple2>("", - // new Tuple4(0, "", 0, 0)); + }else{ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); - public Iterator call(Iterator s) { - while (s.hasNext()) { - Row subKmer = s.next(); - if (HighCoverageSubKmer.size() == 0) { - HighCoverageSubKmer.add( - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), subKmer.getInt(3), -1) - ); - } else { - if (subKmerSlotComparator(subKmer.getSeq(0), HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getSeq(0)) == true) { - if (subKmer.getInt(3) > HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getInt(3)) { - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), subKmer.getInt(3), param.subKmerSize) - ); - } else if (subKmer.getInt(3) == HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getInt(3)) { - if (subKmer.getLong(2) > HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getLong(2)) { - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), subKmer.getInt(3), param.subKmerSize) - ); - } else { - /** - * can be optimized - */ - subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), subKmer.getInt(3), param.subKmerSize) - ); + secondLastKmer=fullKmer; + lastKmer=null; + } + } + } + // -------- + // -------- + // ? + else{ + // -------- + // -------- + // ----- + if (currentLength==param.kmerSize1-1){ + if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),fullKmer.getSeq(0)) && dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0)) ){ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + kmerList.add(fullKmer); + + secondLastKmer=null; + lastKmer=null; + }else if (dynamicSubKmerComparator(fullKmer.getSeq(0),lastKmer.getSeq(0))){ // the last two need to be decided together with the incoming kmer + kmerList.add(secondLastKmer); + secondLastKmer=lastKmer; + lastKmer=fullKmer; + }else{ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + + secondLastKmer=fullKmer; + lastKmer=null; + } + } + // -------- + // -------- + // -------- + else{ // the last two need to be decided together with the incoming k-mer + kmerList.add(secondLastKmer); + + secondLastKmer=lastKmer; + lastKmer = fullKmer; } - } else { - subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), subKmer.getInt(3), param.subKmerSize) - ); } - } else { - HighCoverageSubKmer.add( - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), subKmer.getInt(3), -1) - ); } } } - return HighCoverageSubKmer.iterator(); - } - - private boolean subKmerSlotComparator(Seq a, Seq b) { - for (int i = 0; i < a.length(); i++) { - if (!a.apply(i).equals(b.apply(i))) { - return false; - } - } - - return true; - } - } - - /** - * choose one kmer from a fork with higher coverage. - */ - - - class DSFilterForkSubKmerWithErrorCorrection implements MapPartitionsFunction, Serializable { - List HighCoverageSubKmer = new ArrayList(); -// Tuple2> HighCoverKmer=null; -// new Tuple2>("", - // new Tuple4(0, "", 0, 0)); - - public Iterator call(Iterator s) { - - while (s.hasNext()) { - Row subKmer = s.next(); - int reflexivMarker = getReflexivMarker(subKmer.getLong(1)); - int leftMarker = getLeftMarker(subKmer.getLong(1)); - int rightMarker = getRightMarker(subKmer.getLong(1)); - - long[] subKmerArray = seq2array(subKmer.getSeq(0)); - long attribute=0; + if (secondLastKmer!=null){ + if (lastKmer!=null){ + int secondLastLength = currentKmerSizeFromBinaryBlockArray(seq2array(secondLastKmer.getSeq(0))); + int lastLength = currentKmerSizeFromBinaryBlockArray(seq2array(lastKmer.getSeq(0))); + // ----- + // ? + if (secondLastLength==param.kmerSize1-1){ + // ----- + // ----- + if (lastLength==param.kmerSize1-1){ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + } + // ----- + // -------- + else{ + if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ + long attribute=lastKmer.getLong(1); + if (getRightMarker(secondLastKmer.getLong(1))<0 && getRightMarker(lastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(lastKmer.getLong(1)), getLeftMarker(lastKmer.getLong(1)), -1); + } - int currentSubKmerSize= currentKmerSizeFromBinaryBlockArray(subKmerArray); - int maxKmerSize = param.kmerListInt[param.kmerListInt.length-1]; + if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ + lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, lastKmer.getLong(2)); + }else{ + lastKmer=RowFactory.create(lastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); + } - if (HighCoverageSubKmer.size() == 0) { - attribute = buildingAlongFromThreeInt(reflexivMarker,leftMarker, -1); - HighCoverageSubKmer.add( - RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) - ); - } else { - int highestLeftMarker = getLeftMarker(HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getLong(1)); - if (subKmerSlotComparator(subKmer.getSeq(0), HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getSeq(0)) == true) { - if (leftMarker > highestLeftMarker) { - if (highestLeftMarker <= param.minErrorCoverage && leftMarker >= 2 * highestLeftMarker) { - attribute = buildingAlongFromThreeInt(reflexivMarker, leftMarker, -1); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) - ); - } else { - attribute = buildingAlongFromThreeInt(reflexivMarker, leftMarker, maxKmerSize+3); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) - ); - } - } else if (leftMarker == highestLeftMarker) { - if (subKmer.getLong(2) > HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getLong(2)) { - attribute = buildingAlongFromThreeInt(reflexivMarker, leftMarker, maxKmerSize+3); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) - ); - } else { - subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); - reflexivMarker=getReflexivMarker(subKmer.getLong(1)); - leftMarker=getLeftMarker(subKmer.getLong(1)); - // rightMarker=getRightMarker(subKmer.getLong(1)); - currentSubKmerSize=currentKmerSizeFromBinaryBlockArray(subKmerArray); - attribute = buildingAlongFromThreeInt(reflexivMarker,leftMarker,maxKmerSize+3); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) - ); - } - } else { - if (leftMarker <= param.minErrorCoverage && highestLeftMarker >= 2 * leftMarker) { - subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); - reflexivMarker=getReflexivMarker(subKmer.getLong(1)); - leftMarker=getLeftMarker(subKmer.getLong(1)); - rightMarker=getRightMarker(subKmer.getLong(1)); - // currentSubKmerSize=currentKmerSizeFromBinaryBlockArray((long[])subKmer.get(0)); - attribute = buildingAlongFromThreeInt(reflexivMarker,leftMarker,-1); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) - ); - } else { - subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); - reflexivMarker=getReflexivMarker(subKmer.getLong(1)); - leftMarker=getLeftMarker(subKmer.getLong(1)); - // rightMarker=getRightMarker(subKmer.getLong(1)); - currentSubKmerSize=currentKmerSizeFromBinaryBlockArray(subKmerArray); - attribute = buildingAlongFromThreeInt(reflexivMarker,leftMarker,maxKmerSize+3); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) - ); + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + }else{ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); } } - } else { - attribute = buildingAlongFromThreeInt(reflexivMarker, leftMarker, -1); - HighCoverageSubKmer.add( - RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) - ); } - } - - // System.out.println("first leftMarker: " + leftMarker + " new leftMarker: " + getLeftMarker(attribute)); - } + // -------- + // ? + else{ + // -------- + // ----- + if (lastLength==param.kmerSize1-1){ + if (dynamicSubKmerComparator(secondLastKmer.getSeq(0),lastKmer.getSeq(0))){ + long attribute=secondLastKmer.getLong(1); + if (getRightMarker(lastKmer.getLong(1))<0 && getRightMarker(secondLastKmer.getLong(1))>=0){ + attribute= buildingAlongFromThreeInt(getReflexivMarker(secondLastKmer.getLong(1)), getLeftMarker(secondLastKmer.getLong(1)), -1); + } - return HighCoverageSubKmer.iterator(); - } + if (secondLastKmer.getLong(2) == lastKmer.getLong(2)){ + secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, secondLastKmer.getLong(2)); + }else{ + secondLastKmer=RowFactory.create(secondLastKmer.getSeq(0),attribute, lastKmer.getLong(2)); + } - private boolean subKmerSlotComparator(Seq a, Seq b) { - for (int i = 0; i < a.length(); i++) { - if (!a.apply(i).equals(b.apply(i))) { - return false; + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + } + } + // -------- + // -------- + else{ + kmerList.add(secondLastKmer); + kmerList.add(lastKmer); + } + } + }else{ + kmerList.add(secondLastKmer); } + }else if (lastKmer !=null){ + kmerList.add(lastKmer); } - return true; - } - - private long[] seq2array(Seq a){ - long[] array =new long[a.length()]; - for (int i = 0; i < a.length(); i++) { - array[i] = (Long) a.apply(i); - } - return array; - } - - private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ - int kmerSize; - int blockSize = binaryBlocks.length; - kmerSize= (blockSize-1) *31; - final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- - int lastMers = Long.SIZE/2-suffix0s/2-1; - - kmerSize+=lastMers; - return kmerSize; - + return kmerList.iterator(); } - private int getReflexivMarker(long attribute){ - int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker - return reflexivMarker; - } + private long[] reverseBinaryBlocks(long[] blocks){ + int length = currentKmerSizeFromBinaryBlockArray(blocks); + int blockNumber= blocks.length; + long[] newBlocks= new long[blockNumber]; + int reverseIndex; + int reverseBlockIndex; + int relativeReverseIndex; - private int getLeftMarker(long attribute){ - int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker - int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 - leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker + int forwardBlockIndex; - if (leftMarker>30000){ - leftMarker=30000-leftMarker; - } + long twoBits; + for (int i=0; i>>2*(31-relativeReverseIndex); + twoBits&=3L; - if (rightMarker>30000){ - rightMarker=30000-rightMarker; + newBlocks[forwardBlockIndex]|=twoBits; + newBlocks[forwardBlockIndex] <<=2; } + int lastBlockShift=31-(length-1)%31-1; + newBlocks[newBlocks.length-1] <<=2*lastBlockShift; + newBlocks[newBlocks.length - 1] |= (1L << 2 * (lastBlockShift)); - return rightMarker; + return newBlocks; } private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ @@ -3886,196 +2303,186 @@ private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int ri return info; } - } + private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { + int startingBlockIndex = (shiftingLength)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length - class DSFilterForkReflectedSubKmer implements MapPartitionsFunction, Serializable { - List HighCoverageSubKmer = new ArrayList(); - Integer HighCoverLastCoverage = 0; -// Row HighCoverKmer=null; -// new Row("", - // new Tuple4(0, "", 0, 0)); + int remainLength=nucleotideLength-shiftingLength-1; + if (remainLength <0){ + remainLength=0; + } + long[] newBlock = new long[remainLength/31+1]; + int relativeShiftSize = shiftingLength % 31; - public Iterator call(Iterator s) { - while (s.hasNext()) { - Row subKmer = s.next(); - if (HighCoverageSubKmer.size() == 0) { - HighCoverLastCoverage = subKmer.getInt(3); - HighCoverageSubKmer.add( - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), -1, subKmer.getInt(4)) - ); - } else { - if (subKmerSlotComparator(subKmer.getSeq(0), HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getSeq(0)) == true) { - if (subKmer.getInt(3) > HighCoverLastCoverage) { - HighCoverLastCoverage = subKmer.getInt(3); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), param.subKmerSize, subKmer.getInt(4)) - ); - } else if (subKmer.getInt(3) == HighCoverLastCoverage) { - int subKmerFirstSuffixLength = Long.SIZE / 2 - (Long.numberOfLeadingZeros(subKmer.getLong(2)) / 2 + 1); - int HighCoverageSubKmerFirstSuffixLength = Long.SIZE / 2 - ((Long.numberOfLeadingZeros(HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getLong(2))) / 2 + 1); - Long subKmerFirstSuffix = subKmer.getLong(2) >>> 2 * (subKmerFirstSuffixLength - 1); - Long HighCoverageSubKmerFirstSuffix = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getLong(2) >>> 2 * (HighCoverageSubKmerFirstSuffixLength); - - if (subKmerFirstSuffix.compareTo(HighCoverageSubKmerFirstSuffix) > 0) { - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), param.subKmerSize, subKmer.getInt(4)) - ); - } else { - subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); // re assign - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), param.subKmerSize, subKmer.getInt(4)) - ); - } - } else { - subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), param.subKmerSize, subKmer.getInt(4)) - ); - } - } else { - HighCoverLastCoverage = subKmer.getInt(3); - HighCoverageSubKmer.add( - RowFactory.create(subKmer.getSeq(0), subKmer.getInt(1), subKmer.getLong(2), -1, subKmer.getInt(4)) - ); - } + if (shiftingLength >= nucleotideLength){ + // apparantly, it is possible. meaning the block has nothing left + // throw new Exception("shifting length longer than the kmer length"); + newBlock[0]|=(1L<<2*31); //add c marker at the end + return newBlock; + } + + // if (relativeShiftSize ==0) then only shifting blocks + + int j=0; // new index for shifted blocks + // long oldShiftOut=0L; // if only one block, then 0 bits +// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex +// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); + // } + for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted + newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- + newBlock[j] |= shiftOut; + newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary + + j++; + } + + if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block + newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; + }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end + newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 + } // else the last block has been completely shift into the new last block, including the C marker + + return newBlock; + + } + + private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ + int relativeShiftSize = shiftingLength % 31; + int endingBlockIndex = (shiftingLength-1)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + long[] shiftOutBlocks = new long[endingBlockIndex+1]; + + if (shiftingLength > nucleotideLength){ + // throw new Exception("shifting length longer than the kmer length"); + return blocks; + } + + for (int i=0; i 0) { + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 + shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); + }else{ // relativeShiftSize == 0; + if (endingBlockIndex+1 == blocks.length) { // a block with C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + }else{ // endingBlockIndex < blocks.length -1 means a block without C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC } + + } + + return shiftOutBlocks; + } + + private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throws Exception { + int leftNucleotideLength = currentKmerSizeFromBinaryBlockArray(leftBlocks); + int leftRelativeNTLength = (leftNucleotideLength-1) % 31+1; + int leftVacancy = 31-leftRelativeNTLength; + int rightNucleotideLength = currentKmerSizeFromBinaryBlockArray(rightBlocks); + int combinedBlockSize = (leftNucleotideLength+rightNucleotideLength-1)/31+1; + long[] newBlocks= new long[combinedBlockSize]; + + if (rightNucleotideLength==0){ + return leftBlocks; } - return HighCoverageSubKmer.iterator(); - } + if (leftNucleotideLength==0){ + return rightBlocks; + } - private boolean subKmerSlotComparator(Seq a, Seq b) { - for (int i = 0; i < a.length(); i++) { - if (!a.apply(i).equals(b.apply(i))) { - return false; + if (leftVacancy ==0){ // left last block is a perfect block + for (int i =0; i, Serializable { - List HighCoverageSubKmer = new ArrayList(); - Integer HighCoverLastCoverage = 0; -// Row HighCoverKmer=null; -// new Tuple2>("", - // new Tuple4(0, "", 0, 0)); + for (int j=leftBlocks.length;j call(Iterator s) { + long[] shiftOutBlocks = leftShiftOutFromArray(rightBlocks, leftVacancy); // right shift out for the left. here we only expect one block, because leftVacancy is relative to one block + for (int i =0; i>> 2*(leftRelativeNTLength)); + if (leftBlocks.length HighCoverLastCoverage) { - if (HighCoverLastCoverage <= param.minErrorCoverage && leftMarker >= 2 * HighCoverLastCoverage) { - HighCoverLastCoverage = leftMarker; - attribute = buildingAlongFromThreeInt(reflexivMarker, -1, rightMarker); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) - ); - } else { - HighCoverLastCoverage = leftMarker; - attribute = buildingAlongFromThreeInt(reflexivMarker, maxKmerSize+3, rightMarker); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) - ); - } - } else if (leftMarker == HighCoverLastCoverage) { - int subKmerFirstSuffixLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(subKmer.getLong(2)) / 2 + 1); - int HighCoverageSubKmerFirstSuffixLength = Long.SIZE / 2 - ((Long.numberOfTrailingZeros(HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getLong(2))) / 2 + 1); - Long subKmerFirstSuffix = subKmer.getLong(2) >>> 2 * (32-subKmerFirstSuffixLength); - Long HighCoverageSubKmerFirstSuffix = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getLong(2) >>> 2 * (32-HighCoverageSubKmerFirstSuffixLength); - - if (subKmerFirstSuffix.compareTo(HighCoverageSubKmerFirstSuffix) > 0) { - attribute = buildingAlongFromThreeInt(reflexivMarker, maxKmerSize+3, rightMarker); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), - attribute, subKmer.getLong(2)) - ); - } else { - subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); // re assign - reflexivMarker=getReflexivMarker(subKmer.getLong(1)); - // leftMarker=getLeftMarker(subKmer.getLong(1)); - rightMarker=getRightMarker(subKmer.getLong(1)); - currentSubKmerSize=currentKmerSizeFromBinaryBlockArray(subKmerArray); - - attribute= buildingAlongFromThreeInt(reflexivMarker,maxKmerSize+3, rightMarker); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), - attribute, subKmer.getLong(2)) - ); - } - } else { - if (leftMarker <= param.minErrorCoverage && HighCoverLastCoverage >= 2 * leftMarker) { - subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); - - reflexivMarker=getReflexivMarker(subKmer.getLong(1)); - rightMarker=getRightMarker(subKmer.getLong(1)); - attribute= buildingAlongFromThreeInt(reflexivMarker,-1, rightMarker); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), - attribute, subKmer.getLong(2)) - ); - } else { - subKmer = HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1); - - reflexivMarker=getReflexivMarker(subKmer.getLong(1)); - //leftMarker=getLeftMarker(subKmer.getLong(1)); - rightMarker=getRightMarker(subKmer.getLong(1)); - currentSubKmerSize=currentKmerSizeFromBinaryBlockArray(subKmerArray); - - attribute = buildingAlongFromThreeInt(reflexivMarker, maxKmerSize+3, rightMarker); - HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, - RowFactory.create(subKmer.getSeq(0), - attribute, subKmer.getLong(2)) - ); - } - } - } else { - HighCoverLastCoverage = leftMarker; - attribute = buildingAlongFromThreeInt(reflexivMarker,-1, rightMarker); + //System.out.println(" left Blocks:" + leftBlocksString + " Right blocks: " + rightBlocksString + " rightLength: " + rightNucleotideLength + " leftNucleotideLength: " + leftNucleotideLength + " leftRelativeNTLength: " + leftRelativeNTLength + " leftVacancy: " + leftVacancy + " rightNucleotideLength: " + rightNucleotideLength + " combinedBlockSize: " + combinedBlockSize + " newBlock: " + mergedKmer); + } - HighCoverageSubKmer.add( - RowFactory.create(subKmer.getSeq(0), - attribute, subKmer.getLong(2)) - ); - } - } + return newBlocks; + } - // System.out.println("second leftMarker: " + leftMarker + " new leftMarker: " + getLeftMarker(attribute)); - } + private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ + int kmerSize; + int blockSize = binaryBlocks.length; + kmerSize= (blockSize-1) *31; + final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- + int lastMers = Long.SIZE/2-suffix0s/2-1; + + kmerSize+=lastMers; + return kmerSize; - return HighCoverageSubKmer.iterator(); } - private boolean subKmerSlotComparator(Seq a, Seq b) { - for (int i = 0; i < a.length(); i++) { - if (!a.apply(i).equals(b.apply(i))) { + private boolean dynamicSubKmerComparator(Seq a, Seq b) throws Exception { + long[] arrayA = seq2array(a); + long[] arrayB = seq2array(b); + + int aLength= currentKmerSizeFromBinaryBlockArray(arrayA); + int bLength= currentKmerSizeFromBinaryBlockArray(arrayB); + + if (aLength>bLength){ // equal should not happen + long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); + // String longer = BinaryBlocksToString(shorterVersion); + // String shorter = BinaryBlocksToString(arrayB); + // System.out.println("longer: " + longer + " shorter: " + shorter); + // if (shorterVersion.length>=2 && arrayB.length >=2) { + // System.out.println("longer array: " + shorterVersion[0] + " " + shorterVersion[1] + " shorter array: " + arrayB[0] + " " + arrayB[1]); + //} + if (Arrays.equals(shorterVersion, arrayB)){ + // if (shorterVersion.length>=2){ + // System.out.println("marker!!!"); + // } + return true; + }else{ + return false; + } + }else{ + long[] shorterVersion = leftShiftOutFromArray(arrayB, aLength); + if (Arrays.equals(shorterVersion, arrayA)){ + return true; + }else{ return false; } } - - return true; } private long[] seq2array(Seq a){ @@ -4086,15 +2493,32 @@ private long[] seq2array(Seq a){ return array; } - private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ - int kmerSize; - int blockSize = binaryBlocks.length; - kmerSize= (blockSize-1) *31; - final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- - int lastMers = Long.SIZE/2-suffix0s/2-1; + private String BinaryBlocksToString (long[] binaryBlocks){ + String KmerString=""; + int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); - kmerSize+=lastMers; - return kmerSize; + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + KmerString += currentNucleotide; + } + + return KmerString; + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0) { + nucleotide = 'A'; + } else if (twoBits == 1) { + nucleotide = 'C'; + } else if (twoBits == 2) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; } @@ -4125,114 +2549,210 @@ private int getRightMarker(long attribute){ return rightMarker; } - private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ - long info = (long) ReflexivMarker <<2*(32-1); //move to the left most + private long onlyChangeReflexivMarker(long oldMarker, int reflexivMarker){ + long maxSubKmerBinary = ~((~0L) << 2 * 31); + long newMarker = oldMarker & maxSubKmerBinary; + newMarker |= ((long) reflexivMarker) << 2*(32-1); + return newMarker; + } + } - /** - * shorten the int and change negative to positive to avoid two's complementary - */ - if (leftCover>=30000){ - leftCover=30000; - }else if (leftCover<=-30000){ - leftCover=30000-(-30000); - }else if (leftCover<0){ - leftCover=30000-leftCover; - } + /** + * + */ + class ShorterKmerNeutralization implements MapPartitionsFunction, Serializable{ + List LongerFullKmer = new ArrayList(); + List newFullKmerList = new ArrayList(); + Row shorterFullKmer; + List tempLongerFullKmer = new ArrayList(); + boolean neutralizeMarker = false; - if (rightCover>=30000){ - rightCover=30000; - }else if (rightCover<=-30000){ - rightCover=30000-(-30000); - }else if (rightCover<0){ - rightCover=30000-rightCover; - } + public Iterator call(Iterator s) throws Exception { + // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); + // System.out.println(timestamp + "RepeatCheck ShorterKmerNeutralization: " + param.kmerSize1); - info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left - info |= ((long) rightCover); // 01--LeftCover---RightCover - return info; - } - } + while (s.hasNext()) { + Row fullKmer = s.next(); - class DSForwardSubKmerExtraction implements MapPartitionsFunction, Serializable { - List TupleList = new ArrayList(); - Long suffixBinary; - long[] prefixBinarySlot; - Row kmerTuple; - int currentSubKmerSize; - int currentSubKmerResidue; - int currentSubKmerBlock; + // String fullKmerString = BinaryBlocksToString(seq2array(FullKmer.getSeq(0))); + // System.out.println("neutralization kmer: " + fullKmerString); +/* + long[] FullKmerArray = seq2array(FullKmer.getSeq(0)); + int FullKmerLength = currentKmerSizeFromBinaryBlockArray(FullKmerArray); + if (FullKmerLength == param.kmerSize1) { // shorter FullKmer size = param.kmerSize1 -1 + if (shorterFullKmer != null) { // already one exists + if (tempLongerFullKmer.size() > 0) { + if (getLeftMarker(shorterFullKmer.getLong(1)) > 0) { + for (int i = 0; i < tempLongerFullKmer.size(); i++) { + if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), tempLongerFullKmer.get(i).getSeq(0)) == true) { + neutralizeMarker=true; + newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), true)); + } else { + newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), false)); + } + } + } else { // adding temp to output without changing + for (int i = 0; i < tempLongerFullKmer.size(); i++) { + if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), tempLongerFullKmer.get(i).getSeq(0)) == true) { + neutralizeMarker=true; + } + newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), false)); + } + } + tempLongerFullKmer=new ArrayList(); + } + } - public Iterator call(Iterator s) { + if(neutralizeMarker==true){ + neutralizeMarker=false; // reset marker + }else{ + if (shorterFullKmer!=null) { + newFullKmerList.add(shorterFullKmer); + } + } + shorterFullKmer = FullKmer; + // newFullKmerList.add(FullKmer); + } else { // it is a longer K-mer + if (shorterFullKmer == null) { + tempLongerFullKmer.add(FullKmer); + } else { + if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), FullKmer.getSeq(0)) == true) { + neutralizeMarker=true; + if (getLeftMarker(shorterFullKmer.getLong(1)) > 0) { + newFullKmerList.add(enlighteningLeft(FullKmer, true)); + } else { + newFullKmerList.add(enlighteningLeft(FullKmer, false)); + } + } else { // longer Kmer not overlap to shorter k-mer anymore, a new round starts + if (getLeftMarker(shorterFullKmer.getLong(1)) > 0) { + for (int i = 0; i < tempLongerFullKmer.size(); i++) { + if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), tempLongerFullKmer.get(i).getSeq(0)) == true) { + neutralizeMarker=true; + newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), true)); + } else { + newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), false)); + } + } + } else { // adding temp to output without changing - while (s.hasNext()) { - kmerTuple = s.next(); - /** - * normal Sub-kmer - * Kmer ATGCACGTTATG - * Sub-Kmer ATGCACGTTAT marked as Integer 1 in Tuple2 - * Left -----------G - */ - currentSubKmerSize= currentKmerSizeFromBinaryBlockArray((long[])kmerTuple.get(0))-1; // current sub kmer = kmerTuple -1 - currentSubKmerResidue = (currentSubKmerSize-1)%31 +1; - currentSubKmerBlock = (currentSubKmerSize-1)/31+1; - - - if (currentSubKmerSize == 31) { // currentSubKmerBlock == previousSubKmerBlock -1 - prefixBinarySlot = new long[currentSubKmerBlock]; - - suffixBinary = ((long[]) kmerTuple.get(0))[currentSubKmerBlock-1]; // last block XC---------- C marker keep it - for (int i = 0; i < currentSubKmerBlock; i++) { - prefixBinarySlot[i] = ((long[]) kmerTuple.get(0))[i]; + for (int i = 0; i < tempLongerFullKmer.size(); i++) { + if (dynamicSubKmerComparator(shorterFullKmer.getSeq(0), tempLongerFullKmer.get(i).getSeq(0)) == true) { + neutralizeMarker=true; + } + newFullKmerList.add(enlighteningLeft(tempLongerFullKmer.get(i), false)); + } + } + + if (neutralizeMarker==true){ + neutralizeMarker=false; + }else{ + newFullKmerList.add(shorterFullKmer); + } + + tempLongerFullKmer= new ArrayList(); + shorterFullKmer = null; + tempLongerFullKmer.add(FullKmer); + } } - } else { // currentSubKmerBlock == previousSubKmerBlock - prefixBinarySlot = new long[currentSubKmerBlock]; + } +*/ - suffixBinary = (((long[]) kmerTuple.get(0))[currentSubKmerBlock-1] - << (2*currentSubKmerResidue)); // include C marker + if (LongerFullKmer.size() == 0) { + LongerFullKmer.add( + RowFactory.create(fullKmer.getSeq(0), fullKmer.getLong(1)) + ); + } else { + int currentLength= currentKmerSizeFromBinaryBlockArray(seq2array(fullKmer.getSeq(0))); + int lastLength = currentKmerSizeFromBinaryBlockArray(seq2array(LongerFullKmer.get(LongerFullKmer.size() - 1).getSeq(0))); + if ( currentLength== lastLength ){ // two kmer with equal size + LongerFullKmer.add( + RowFactory.create(fullKmer.getSeq(0), fullKmer.getLong(1)) + ); + } else if (dynamicSubKmerComparator(fullKmer.getSeq(0), LongerFullKmer.get(LongerFullKmer.size() - 1).getSeq(0)) == true) { + long[] lastKmer = seq2array(LongerFullKmer.get(LongerFullKmer.size() - 1).getSeq(0)); + long[] currentKmer = seq2array(fullKmer.getSeq(0)); + int lastKmerLength = currentKmerSizeFromBinaryBlockArray(lastKmer); + int currentKmerLength = currentKmerSizeFromBinaryBlockArray(currentKmer); - for (int i = 0; i < currentSubKmerBlock; i++) { - prefixBinarySlot[i] = ((long[]) kmerTuple.get(0))[i]; + if (lastKmerLength >currentKmerLength){ + continue; + }else{ + LongerFullKmer.remove(LongerFullKmer.size() - 1); + LongerFullKmer.add( + RowFactory.create(fullKmer.getSeq(0), fullKmer.getLong(1)) + ); + } + } else { + LongerFullKmer.add( + RowFactory.create(fullKmer.getSeq(0), fullKmer.getLong(1)) + ); } - - long currentSubKmerResidueBinary = ~0L<< 2*(32-currentSubKmerResidue); // 1111111111------ - - prefixBinarySlot[currentSubKmerBlock - 1] = ((long[]) kmerTuple.get(0))[currentSubKmerBlock - 1] & currentSubKmerResidueBinary; - prefixBinarySlot[currentSubKmerBlock - 1] |= 1L <<2*(32-currentSubKmerResidue-1); // add C marker } - long attribute = buildingAlongFromThreeInt(1, kmerTuple.getInt(1), kmerTuple.getInt(1)); - - // System.out.println("Coverage: " + kmerTuple.getInt(1) + " before long: " + ((long[])kmerTuple.get(0))[0] + " after long: " + prefixBinarySlot[0]); + } +/* + if (shorterFullKmer==null){ + for (int i=0;i0){ + for (int i=0;i>> 2 * (32 - (i%31+1)); - currentNucleotideBinary &= 3L; - char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); - KmerString += currentNucleotide; - } - - return KmerString; - } - - private int getReflexivMarker(long attribute){ - int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker - return reflexivMarker; - } - - private int getLeftMarker(long attribute){ - int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker - int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 - leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker - - if (leftMarker>30000){ - leftMarker=30000-leftMarker; - } - - return leftMarker; - } - - private int getRightMarker(long attribute){ - int rightMarker = (int) attribute; - - if (rightMarker>30000){ - rightMarker=30000-rightMarker; - } - - return rightMarker; - } - - } - - - /** - * - */ - - - class DSReflectedSubKmerExtractionFromForward implements MapPartitionsFunction, Serializable { - List TupleList = new ArrayList(); - Long suffixBinary; - long[] prefixBinarySlot; - Row kmerTuple; - // int shift = (2 * (param.subKmerSizeResidue - 1)); - Long maxSubKmerResdueBinary; - Long maxSubKmerBinary = ~((~0L) << 2 * 31); - - int currentSubKmerSize; - int currentSubKmerResidue; - int currentSubKmerBlock; - - public Iterator call(Iterator s) { - - while (s.hasNext()) { - kmerTuple = s.next(); + int aLength= currentKmerSizeFromBinaryBlockArray(arrayA); + int bLength= currentKmerSizeFromBinaryBlockArray(arrayB); - long[] kmerTupleArray = seq2array(kmerTuple.getSeq(0)); - - // String before = BinaryBlocksToString(kmerTupleArray); - long[] beforeSuffixLong = new long[1]; - beforeSuffixLong[0]=kmerTuple.getLong(2); - // String beforeSuffix = BinaryBlocksToString(beforeSuffixLong); - - currentSubKmerSize= currentKmerSizeFromBinaryBlockArray(kmerTupleArray); - currentSubKmerResidue = (currentSubKmerSize-1)%31 +1; - currentSubKmerBlock = (currentSubKmerSize-1)/31+1; - maxSubKmerResdueBinary= ((~0L) << 2 * (32-currentSubKmerResidue)); - - long[] prefixBinarySlot = new long[currentSubKmerBlock]; - - /** - * reflected Sub-kmer - * Kmer ATGCACGTTATG - * Sub-Kmer ATGCACGTTAT marked as Integer 1 in Tuple2 - * Left -----------G - */ - // suffixBinary = 3L << shift; - suffixBinary = (Long) kmerTuple.getSeq(0).apply(0) >>> 2*(32-1); // xx--------- -> ----------xx - suffixBinary <<= 2*(32-1); // ---------xx -> xx000000000 - // suffixBinary >>>= shift; - suffixBinary |= (1L << 2*(32-1-1)); // add C marker in the front 0100 = 4L - - long transmitBit1 = (Long) kmerTuple.getSeq(0).apply(currentSubKmerBlock - 1) >>> 2 * (32 - 1); // xx------------- - prefixBinarySlot[currentSubKmerBlock - 1] = ((Long) kmerTuple.getSeq(0).apply(currentSubKmerBlock - 1) & maxSubKmerResdueBinary) << 2; - //prefixBinarySlot[currentSubKmerBlock - 1] &= maxSubKmerResdueBinary; - prefixBinarySlot[currentSubKmerBlock - 1] |= kmerTuple.getLong(2)>>> 2*(currentSubKmerResidue-1); // xx01-------- -> ----------xx01 - - for (int i = currentSubKmerBlock - 2; i >= 0; i--) { - long transmitBit2 = (Long) kmerTuple.getSeq(0).apply(i) >>> 2*(32-1); - - prefixBinarySlot[i] = (Long) kmerTuple.getSeq(0).apply(i) << 2; - // prefixBinarySlot[i] &= maxSubKmerBinary; - prefixBinarySlot[i] |= (transmitBit1 <<1*2); // --------xx - > --------xx-- - - transmitBit1 = transmitBit2; + if (aLength>bLength){ // equal should not happen + long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); + // String longer = BinaryBlocksToString(shorterVersion); + // String shorter = BinaryBlocksToString(arrayB); + // System.out.println("longer: " + longer + " shorter: " + shorter); + // if (shorterVersion.length>=2 && arrayB.length >=2) { + // System.out.println("longer array: " + shorterVersion[0] + " " + shorterVersion[1] + " shorter array: " + arrayB[0] + " " + arrayB[1]); + //} + if (Arrays.equals(shorterVersion, arrayB)){ + // if (shorterVersion.length>=2){ + // System.out.println("marker!!!"); + // } + return true; + }else{ + return false; + } + }else{ + long[] shorterVersion = leftShiftOutFromArray(arrayB, aLength); + if (Arrays.equals(shorterVersion, arrayA)){ + return true; + }else{ + return false; } - - long beforeMarker= kmerTuple.getLong(1) >>> 2*31; - long attribute = onlyChangeReflexivMarker(kmerTuple.getLong(1), 2); - long afterMarker= attribute >>> 2*31; - long afterMarker2= getReflexivMarker(attribute); - long leftMarker= getLeftMarker(attribute); - long rightMarker =getRightMarker(attribute); - // System.out.println("before long: " + kmerTupleArray[0] + " after long: " + prefixBinarySlot[0]); - // System.out.println("before Marker: " + beforeMarker + " after Marker: " + afterMarker + " " + afterMarker2); - // System.out.println("leftMarker: " + leftMarker + " rightMarker: " + rightMarker); - - - // String after = BinaryBlocksToString(prefixBinarySlot); - long[] afterSuffixLong = new long[1]; - afterSuffixLong[0]=suffixBinary; - // String afterSuffix = BinaryBlocksToString(afterSuffixLong); - - // System.out.println("before: " + before + " " + beforeSuffix + " after: " + afterSuffix + " " + after); - - TupleList.add( - RowFactory.create(prefixBinarySlot, attribute, suffixBinary) - ); } - - return TupleList.iterator(); - } - - private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ - int kmerSize; - int blockSize = binaryBlocks.length; - kmerSize= (blockSize-1) *31; - final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- - int lastMers = Long.SIZE/2-suffix0s/2-1; - - kmerSize+=lastMers; - return kmerSize; - } private long[] seq2array(Seq a){ @@ -4447,69 +2831,66 @@ private long[] seq2array(Seq a){ return array; } - private long onlyChangeReflexivMarker(long oldMarker, int reflexivMarker){ - long newMarker = oldMarker & maxSubKmerBinary; - newMarker |= ((long) reflexivMarker) << 2*(32-1); - return newMarker; - } + private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ + int relativeShiftSize = shiftingLength % 31; + int endingBlockIndex = (shiftingLength-1)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + long[] shiftOutBlocks = new long[endingBlockIndex+1]; - private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ - long info = (long) ReflexivMarker <<2*(32-1); //move to the left most + if (shiftingLength > nucleotideLength){ + return blocks; + // throw new Exception("shifting length longer than the kmer length"); + } - /** - * shorten the int and change negative to positive to avoid two's complementary - */ - if (leftCover>=30000){ - leftCover=30000; - }else if (leftCover<=-30000){ - leftCover=30000-(-30000); - }else if (leftCover<0){ - leftCover=30000-leftCover; + for (int i=0; i=30000){ - rightCover=30000; - }else if (rightCover<=-30000){ - rightCover=30000-(-30000); - }else if (rightCover<0){ - rightCover=30000-rightCover; + if (relativeShiftSize > 0) { + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 + shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); + }else{ // relativeShiftSize == 0; + if (endingBlockIndex+1 == blocks.length) { // a block with C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + }else{ // endingBlockIndex < blocks.length -1 means a block without C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC + } + } + return shiftOutBlocks; + } - info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left + private String BinaryBlocksToString (long[] binaryBlocks){ + String KmerString=""; + int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); - info |= ((long) rightCover); // 01--LeftCover---RightCover + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + KmerString += currentNucleotide; + } - return info; + return KmerString; } private char BinaryToNucleotide(Long twoBits) { char nucleotide; - if (twoBits == 0L) { + if (twoBits == 0) { nucleotide = 'A'; - } else if (twoBits == 1L) { + } else if (twoBits == 1) { nucleotide = 'C'; - } else if (twoBits == 2L) { + } else if (twoBits == 2) { nucleotide = 'G'; } else { nucleotide = 'T'; } return nucleotide; - } - - private String BinaryBlocksToString (long[] binaryBlocks){ - String KmerString=""; - int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); - - for (int i=0; i< KmerLength; i++){ - Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); - currentNucleotideBinary &= 3L; - char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); - KmerString += currentNucleotide; - } - return KmerString; } + private int getReflexivMarker(long attribute){ int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker return reflexivMarker; @@ -4518,12 +2899,8 @@ private int getReflexivMarker(long attribute){ private int getLeftMarker(long attribute){ int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 - - leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker - - if (leftMarker>30000){ leftMarker=30000-leftMarker; } @@ -4541,7 +2918,6 @@ private int getRightMarker(long attribute){ return rightMarker; } - } class DSSubKmerToFullKmer implements MapPartitionsFunction, Serializable { @@ -4796,117 +3172,6 @@ private String BinaryBlocksToString (long[] binaryBlocks){ } } - /** - * - */ - - - /** - * interface class for RDD implementation, used in step 5 - */ - - /** - * interface class for RDD implementation, used in step 4 - */ - - - class DSKmerReverseComplement implements MapPartitionsFunction, Serializable { - /* a capsule for all Kmers and reverseComplementKmers */ - List kmerList = new ArrayList(); - long[] reverseComplement; - long[] forwardKmer; - Row kmerTuple; - Long lastTwoBits; - Seq kmerBinarySeq; - - int currentKmerBlockSize; - int currentKmerSize; - int currentKmerResidue; - - - public Iterator call(Iterator s) { - // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); - // System.out.println(timestamp+ "RepeatCheck DSKmerReverseComplement: " + param.kmerSize1); - - while (s.hasNext()) { - kmerTuple = s.next(); - kmerBinarySeq = kmerTuple.getSeq(0); - //reverseComplement=0L; - - currentKmerBlockSize = kmerBinarySeq.length(); - currentKmerSize = currentKmerSizeFromBinaryBlock(kmerBinarySeq); - currentKmerResidue = currentKmerResidueFromBlock(kmerBinarySeq); - - forwardKmer = new long[currentKmerBlockSize]; - reverseComplement = new long[currentKmerBlockSize]; - - for (int i = 0; i < currentKmerSize; i++) { - int RCindex = currentKmerSize - i - 1; // ------------- ------------- ---------**-- RC index goes reverse - // ------------- ------------- -------**---- <-- - // reverseComplement[i / 31] <<= 2; - - if (RCindex >= currentKmerSize - currentKmerResidue) { - lastTwoBits = (Long) kmerBinarySeq.apply(RCindex / 31) >>> 2 * (32-(RCindex % 31)-1); // ------------- ------------- ------|----** - lastTwoBits &= 3L; - lastTwoBits ^= 3L; - } else { // the same - lastTwoBits = (Long) kmerBinarySeq.apply(RCindex / 31) >>> 2 * (32 - (RCindex % 31) - 1); - lastTwoBits &= 3L; - lastTwoBits ^= 3L; - } - - reverseComplement[i / 31] |= lastTwoBits; - reverseComplement[i / 31] <<=2; // the order of these two lines are very important - - } - reverseComplement[(currentKmerSize-1)/31] <<= 2*(32-currentKmerResidue-1); // ---xxxxxxx -> xxxxxxx--- extra -1 because there are a vacancy from the step above - reverseComplement[(currentKmerSize-1)/31]|=(1L<<2*(32-currentKmerResidue-1)); // adding ending marker C - - - for (int i = 0; i < currentKmerBlockSize; i++) { - forwardKmer[i] = (Long) kmerTuple.getSeq(0).apply(i); - } - - kmerList.add(RowFactory.create(forwardKmer, kmerTuple.getInt(1))); - kmerList.add(RowFactory.create(reverseComplement, kmerTuple.getInt(1))); - - } - - return kmerList.iterator(); - } - - private int currentKmerResidueFromBlock(Seq binaryBlocks){ - final int suffix0s = Long.numberOfTrailingZeros((Long)binaryBlocks.apply(binaryBlocks.length()-1)); - return Long.SIZE/2 - suffix0s/2 -1; - } - - private int currentKmerSizeFromBinaryBlock(Seq binaryBlocks){ - int kmerSize; - int blockSize = binaryBlocks.length(); - kmerSize= (blockSize-1) *31; - final int suffix0s = Long.numberOfTrailingZeros((Long) binaryBlocks.apply(blockSize - 1)); // ATCG...01--- - int lastMers = Long.SIZE/2-suffix0s/2 -1; // minus last marker - - kmerSize+=lastMers; - return kmerSize; - - } - - private char BinaryToNucleotide(Long twoBits) { - char nucleotide; - if (twoBits == 0L) { - nucleotide = 'A'; - } else if (twoBits == 1L) { - nucleotide = 'C'; - } else if (twoBits == 2L) { - nucleotide = 'G'; - } else { - nucleotide = 'T'; - } - return nucleotide; - } - } - class DynamicKmerBinarizerFromSorted implements MapPartitionsFunction, Serializable{ List kmerList = new ArrayList(); Row units; @@ -5033,236 +3298,6 @@ private long nucleotideValue(char a) { } } - class DynamicKmerBinarizer implements MapPartitionsFunction, Serializable { - - List kmerList = new ArrayList(); - Row units; - String kmer; - int currentKmerSize; - int currentKmerBlockSize; - int cover; - char nucleotide; - long nucleotideInt; - // Long suffixBinary; - // Long[] suffixBinaryArray; - - public Iterator call(Iterator s) { - // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); - // System.out.println(timestamp+"RepeatCheck DynamicKmerBinarizer: " + param.kmerSize1); - - while (s.hasNext()) { - - units = s.next(); - - kmer = units.getString(0); - - if (kmer.startsWith("(")) { - kmer = kmer.substring(1); - } - - currentKmerSize= kmer.length(); - currentKmerBlockSize = (currentKmerSize-1)/31+1; // each 31 mer is a block - - if (!kmerSizeCheck(kmer, param.kmerListHash)){continue;} // the kmer length does not fit into any of the kmers in the list. - - if (units.getString(1).endsWith(")")) { - if (units.getString(1).length() >= 11) { - cover = 1000000000; - } else { - cover = Integer.parseInt(StringUtils.chop(units.getString(1))); - } - } else { - if (units.getString(1).length() >= 10) { - cover = 1000000000; - } else { - cover = Integer.parseInt(units.getString(1)); - } - } - - long[] nucleotideBinarySlot = new long[currentKmerBlockSize]; - // Long nucleotideBinary = 0L; - - for (int i = 0; i < currentKmerSize; i++) { - nucleotide = kmer.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideInt <<= 2*(32-1-(i%31)); // shift to the left [ATCGGATCC-,ATCGGATCC-] -// nucleotideBinarySlot[i / 31] <<= 2*((32-i)%32); - nucleotideBinarySlot[i / 31] |= nucleotideInt; - - // nucleotideBinary <<= 2; - // nucleotideBinary |= nucleotideInt; - } - - // marking the end of the kmer - long kmerEndMark = 1L; - kmerEndMark <<= 2*(32-1-((currentKmerSize-1)%31+1)); - nucleotideBinarySlot[param.kmerListHash.get(currentKmerSize)-1] |= kmerEndMark; // param.kmerListHash.get(currentKmerSize)] == currentKmerBlockSize - - // return - kmerList.add( - RowFactory.create(nucleotideBinarySlot, cover) - ); - } - - return kmerList.iterator(); - } - - private boolean kmerSizeCheck(String kmer, HashMap kmerList){ - if (kmerList.containsKey(kmer.length())) { - return true; - }else { - return false; - } - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - - } - - class ReverseComplementKmerBinaryExtractionFromDataset implements MapPartitionsFunction, Serializable { - long maxKmerBits = ~((~0L) << (2 * param.kmerSize)); - - List kmerList = new ArrayList(); - int readLength; - String[] units; - String read; - char nucleotide; - long nucleotideInt; - long nucleotideIntComplement; - - public Iterator call(Iterator s) { - - while (s.hasNext()) { - units = s.next().split("\\n"); - read = units[1]; - readLength = read.length(); - - if (readLength - param.kmerSize - param.endClip <= 1 || param.frontClip > readLength) { - continue; - } - - Long nucleotideBinary = 0L; - Long nucleotideBinaryReverseComplement = 0L; - - for (int i = param.frontClip; i < readLength - param.endClip; i++) { - nucleotide = read.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideBinary <<= 2; - nucleotideBinary |= nucleotideInt; - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinary &= maxKmerBits; - } - - // reverse kmer binarizationalitivities :) non English native speaking people making fun of English - nucleotideIntComplement = nucleotideInt ^ 3; // 3 is binary 11; complement: 11(T) to 00(A), 10(G) to 01(C) - - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinaryReverseComplement >>>= 2; - nucleotideIntComplement <<= 2 * (param.kmerSize - 1); - } else { - nucleotideIntComplement <<= 2 * (i - param.frontClip); - } - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - - // reach the first complete K-mer - if (i - param.frontClip >= param.kmerSize - 1) { - if (nucleotideBinary.compareTo(nucleotideBinaryReverseComplement) < 0) { - kmerList.add(nucleotideBinary); - } else { - kmerList.add(nucleotideBinaryReverseComplement); - } - } - } - } - return kmerList.iterator(); - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - } - - /** - * interface class for RDD implementation, used in step 3 - * ----------- - * ------ - * ------ - * ------ - * ------ - * ------ - * ------ - */ - - - class DSFastqUnitFilter implements FilterFunction, Serializable { - public boolean call(String s) { - return s != null; - } - } - - /** - * interface class for RDD implementation, Used in step 1 - */ - - - class DSFastqFilterWithQual implements MapFunction, Serializable { - String line = ""; - int lineMark = 0; - - public String call(String s) { - if (lineMark == 2) { - lineMark++; - line = line + "\n" + s; - return null; - } else if (lineMark == 3) { - lineMark++; - line = line + "\n" + s; - return line; - } else if (s.startsWith("@")) { - line = s; - lineMark = 1; - return null; - } else if (lineMark == 1) { - line = line + "\n" + s; - lineMark++; - return null; - } else { - return null; - } - } - } - - /** - * interface class for RDD implementation, used in step 2 - */ - - /** * * @param param diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicMercyKmer.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicMercyKmer.java new file mode 100644 index 0000000..1ba3be7 --- /dev/null +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSDynamicMercyKmer.java @@ -0,0 +1,4218 @@ +package uni.bielefeld.cmg.reflexiv.pipeline; + + +import com.fing.mapreduce.FourMcTextInputFormat; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.MapPartitionsFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder; +import org.apache.spark.sql.catalyst.encoders.RowEncoder; +import org.apache.spark.sql.catalyst.plans.logical.MapPartitions; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.storage.StorageLevel; +import scala.Tuple2; +import scala.collection.Seq; +import uni.bielefeld.cmg.reflexiv.util.DefaultParam; +import uni.bielefeld.cmg.reflexiv.util.InfoDumper; + +import java.io.IOException; +import java.io.Serializable; +import java.util.*; + +import static org.apache.spark.sql.functions.col; + + +/** + * Created by rhinempi on 22.07.2017. + *

+ * Reflexiv + *

+ * Copyright (c) 2017. + * Liren Huang + *

+ * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +/** + * Returns an object for running the Reflexiv main pipeline. + * + * @author Liren Huang + * @version %I%, %G% + * @see + */ +public class ReflexivDSDynamicMercyKmer implements Serializable { + private long time; + private DefaultParam param; + + private InfoDumper info = new InfoDumper(); + + /** + * + */ + private void clockStart() { + time = System.currentTimeMillis(); + } + + /** + * + * @return + */ + private long clockCut() { + long tmp = time; + time = System.currentTimeMillis(); + return time - tmp; + } + + /**wc + * + * @return + */ + private SparkConf setSparkConfiguration() { + SparkConf conf = new SparkConf().setAppName("Reflexiv"); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.set("spark.kryo.registrator", "uni.bielefeld.cmg.reflexiv.serializer.SparkKryoRegistrator"); + conf.set("spark.cleaner.referenceTracking.cleanCheckpoints", "true"); + conf.set("spark.checkpoint.compress", "true"); + conf.set("spark.hadoop.mapred.max.split.size", "6000000"); + conf.set("spark.sql.files.maxPartitionBytes", "6000000"); + conf.set("spark.sql.adaptive.coalescePartitions.parallelismFirst", "false"); + conf.set("spark.sql.adaptive.advisoryPartitionSizeInBytes","12000000"); + conf.set("spark.driver.maxResultSize","1000g"); + conf.set("spark.memory.fraction","0.7"); + conf.set("spark.network.timeout","60000s"); + conf.set("spark.executor.heartbeatInterval","20000s"); + + return conf; + } + + private SparkSession setSparkSessionConfiguration(int shufflePartitions) { + SparkSession spark = SparkSession + .builder() + .appName("Reflexiv") + .config("spark.kryo.registrator", "uni.bielefeld.cmg.reflexiv.serializer.SparkKryoRegistrator") + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config("spark.cleaner.referenceTracking.cleanCheckpoints", true) + .config("spark.checkpoint.compress",true) + .config("spark.sql.shuffle.partitions", String.valueOf(shufflePartitions)) + .config("spark.sql.files.maxPartitionBytes", "6000000") + .config("spark.sql.adaptive.coalescePartitions.parallelismFirst", false) + .config("spark.sql.adaptive.advisoryPartitionSizeInBytes","12mb") + .config("spark.driver.maxResultSize","1000g") + .config("spark.memory.fraction","0.7") + .config("spark.network.timeout","60000s") + .config("spark.executor.heartbeatInterval","20000s") + .getOrCreate(); + + return spark; + } + + private Hashtable, Integer> SubKmerProbRowToHash(List s){ + Hashtable, Integer> ProbHash = new Hashtable, Integer>(); + for (int i =0; i Key = new ArrayList(); + for (int j=0; j KmerCountDS; + + StructType kmerBinaryCountTupleLongStruct = new StructType(); + kmerBinaryCountTupleLongStruct = kmerBinaryCountTupleLongStruct.add("ID", DataTypes.LongType, false); + kmerBinaryCountTupleLongStruct = kmerBinaryCountTupleLongStruct.add("kmer", DataTypes.createArrayType(DataTypes.LongType), false); + ExpressionEncoder KmerBinaryCountLongEncoder = RowEncoder.apply(kmerBinaryCountTupleLongStruct); + + Dataset markerKmerRow; + Dataset markerTupleRow; + Dataset markerRangeRow; + Dataset ContigDS; + + + Dataset FastqDS; + JavaPairRDD FastqIndex; + Dataset> FastqDSTuple; + + + if (param.inputFormat.equals("4mc")){ + Configuration baseConfiguration = new Configuration(); + + Job jobConf = Job.getInstance(baseConfiguration); + + JavaPairRDD FastqPairRDD = jsc.newAPIHadoopFile(param.inputFqPath, FourMcTextInputFormat.class, LongWritable.class, Text.class, jobConf.getConfiguration()); + + if (param.partitions > 0) { + FastqPairRDD = FastqPairRDD.repartition(param.partitions); + } + + DSInputTupleToString tupleToString = new DSInputTupleToString(); + + JavaRDD FastqRDD = FastqPairRDD.mapPartitions(tupleToString); + + FastqDS = spark.createDataset(FastqRDD.rdd(), Encoders.STRING()); + }else { + FastqDS = spark.read().text(param.inputFqPath).as(Encoders.STRING()); + + if (param.partitions > 0) { + FastqDS = FastqDS.repartition(param.partitions); + } + + if (!param.inputFormat.equals("line")) { + DSFastqFilterOnlySeq DSFastqFilterToSeq = new DSFastqFilterOnlySeq(); // for reflexiv + FastqDS = FastqDS.mapPartitions(DSFastqFilterToSeq, Encoders.STRING()); + } + + } + + + if (param.cache) { + FastqDS.cache(); + } + + + FastqIndex = FastqDS.toJavaRDD().zipWithIndex(); + + FastqDSTuple = spark.createDataset(FastqIndex.rdd(), Encoders.tuple(Encoders.STRING(), Encoders.LONG())); + + FastqDSTuple.persist(StorageLevel.DISK_ONLY()); + + ReverseComplementKmerBinaryExtractionFromDataset DSExtractRCKmerBinaryFromFastq = new ReverseComplementKmerBinaryExtractionFromDataset(); + + Dataset ReadSeedDS; + Dataset ContigSeedDS; + StructType ReadAndContigSeedStruct = new StructType(); + ReadAndContigSeedStruct = ReadAndContigSeedStruct.add("seed", DataTypes.createArrayType(DataTypes.LongType), false); + ReadAndContigSeedStruct = ReadAndContigSeedStruct.add("ID", DataTypes.LongType, false); + ReadAndContigSeedStruct = ReadAndContigSeedStruct.add("index", DataTypes.LongType, false); + ExpressionEncoder ReadAndContigSeedEncoder = RowEncoder.apply(ReadAndContigSeedStruct); + + ReadSeedDS = FastqDSTuple.mapPartitions(DSExtractRCKmerBinaryFromFastq, ReadAndContigSeedEncoder); + + + /** + * loading Kmer counts + */ + KmerCountDS = spark.read().csv(param.inputKmerPath); + + if (param.partitions > 0) { + KmerCountDS = KmerCountDS.repartition(param.partitions); + } + + DynamicKmerBinarizerFromReducedToSubKmer ReducedKmerToSubKmer= new DynamicKmerBinarizerFromReducedToSubKmer(); + markerKmerRow = KmerCountDS.mapPartitions(ReducedKmerToSubKmer, ReadAndContigSeedEncoder); + // markerTupleRow.persist(StorageLevel.DISK_ONLY()); + + markerKmerRow = markerKmerRow.union(ReadSeedDS); + + markerKmerRow = markerKmerRow.sort("seed"); + + + StructType ReadAndIndexStruct = new StructType(); + ReadAndIndexStruct = ReadAndIndexStruct.add("ID", DataTypes.LongType, false); + ReadAndIndexStruct = ReadAndIndexStruct.add("index", DataTypes.LongType, false); + ExpressionEncoder ReadAndIndexEncoder = RowEncoder.apply(ReadAndIndexStruct); + + RamenReadExtraction extractRamen = new RamenReadExtraction(); + markerTupleRow = markerKmerRow.mapPartitions(extractRamen, ReadAndIndexEncoder); + + markerTupleRow = markerTupleRow.sort("ID"); + + RamenReadRangeCal CalculateReadRange = new RamenReadRangeCal (); + + StructType ReadAndRangeStruct = new StructType(); + ReadAndRangeStruct = ReadAndRangeStruct.add("ID", DataTypes.LongType, false); + ReadAndRangeStruct = ReadAndRangeStruct.add("ranges", DataTypes.createArrayType(DataTypes.LongType), false); + ExpressionEncoder ReadAndRangeEncoder = RowEncoder.apply(ReadAndRangeStruct); + markerRangeRow = markerTupleRow.mapPartitions(CalculateReadRange, ReadAndRangeEncoder); + + Dataset FastqIndexedDS; + FastqTuple2Dataset FastqTupleChange = new FastqTuple2Dataset(); + FastqIndexedDS = FastqDSTuple.mapPartitions(FastqTupleChange,ReadAndRangeEncoder); + + FastqIndexedDS = FastqIndexedDS.union(markerRangeRow); + + FastqIndexedDS = FastqIndexedDS.sort("ID"); + + ExtractMercyKmerFromRead mercyKmerExtraction = new ExtractMercyKmerFromRead(); + + + + + ContigKmerMarkerExtraction extractContigTails = new ContigKmerMarkerExtraction(); + ContigSeedDS = markerTupleRow.mapPartitions(extractContigTails, ReadAndContigSeedEncoder); + + ContigSeedDS = ContigSeedDS.union(ReadSeedDS); + + ContigSeedDS = ContigSeedDS.sort("seed"); + + Dataset RACpairDS; + StructType RACPairStruct = new StructType(); + RACPairStruct = RACPairStruct.add("read", DataTypes.LongType, false); + RACPairStruct = RACPairStruct.add("contig", DataTypes.LongType, false); + RACPairStruct = RACPairStruct.add("index", DataTypes.IntegerType, false); + ExpressionEncoder RACPairEncoder = RowEncoder.apply(RACPairStruct); + + ReadAndContigPairs matchReadAndContig = new ReadAndContigPairs(); + RACpairDS = ContigSeedDS.mapPartitions(matchReadAndContig, RACPairEncoder); + + RACpairDS= RACpairDS.sort("read", "contig"); + + Dataset CCPairDS; + StructType CCPairStruct = new StructType(); + CCPairStruct = CCPairStruct.add("left", DataTypes.LongType, false); + CCPairStruct = CCPairStruct.add("right", DataTypes.LongType, false); + CCPairStruct = CCPairStruct.add("index", DataTypes.LongType, false); + CCPairStruct = CCPairStruct.add("seq", DataTypes.LongType, false); + ExpressionEncoder CCPairEncoder = RowEncoder.apply(CCPairStruct); + + CreatCCPairs matchContigToContig = new CreatCCPairs(); + CCPairDS = RACpairDS.mapPartitions(matchContigToContig, CCPairEncoder); + + CCPairDS = CCPairDS.sort("left", "right"); + + StructType CCPairStructCount = new StructType(); + CCPairStructCount = CCPairStructCount.add("left", DataTypes.LongType, false); + CCPairStructCount = CCPairStructCount.add("right", DataTypes.LongType, false); + CCPairStructCount = CCPairStructCount.add("index", DataTypes.LongType, false); + CCPairStructCount = CCPairStructCount.add("seq", DataTypes.LongType, false); + CCPairStructCount = CCPairStructCount.add("count", DataTypes.LongType, false); + ExpressionEncoder CCPairEncoderCount = RowEncoder.apply(CCPairStructCount); + + CCPairsToConnections filterForCCpair = new CCPairsToConnections(); + CCPairDS= CCPairDS.mapPartitions(filterForCCpair,CCPairEncoderCount); + + CCPairDS=CCPairDS.sort(col("right").asc(), col("count").desc()); + + Dataset MarkedReads; + StructType CCNetStruct = new StructType(); + CCNetStruct = CCNetStruct.add("read", DataTypes.LongType, false); + CCNetStruct = CCNetStruct.add("CCMeta", DataTypes.createArrayType(DataTypes.LongType), false); + ExpressionEncoder CCNetEncoder = RowEncoder.apply(CCNetStruct); + + CCPairsToConnectionsRight filterForCCpairRight = new CCPairsToConnectionsRight(); + MarkedReads = CCPairDS.mapPartitions(filterForCCpairRight, CCNetEncoder); + + // Dataset FastqIndexedDS; + // FastqTuple2Dataset FastqTupleChange = new FastqTuple2Dataset(); + // FastqIndexedDS = FastqDSTuple.mapPartitions(FastqTupleChange,CCNetEncoder); + + MarkedReads = MarkedReads.union(FastqIndexedDS); + + MarkedReads = MarkedReads.sort("read"); + + Dataset CCNetWithSeq; + StructType ContigSeqStruct = new StructType(); + ContigSeqStruct = ContigSeqStruct.add("ID", DataTypes.LongType, false); + ContigSeqStruct = ContigSeqStruct.add("seq", DataTypes.createArrayType(DataTypes.LongType), false); + ExpressionEncoder ContigSeqEncoder = RowEncoder.apply(ContigSeqStruct); + + ExtractReadSequenceForCCNet ReadSeqExtraction = new ExtractReadSequenceForCCNet(); + CCNetWithSeq = MarkedReads.mapPartitions(ReadSeqExtraction, ContigSeqEncoder); + + CCNetWithSeq = CCNetWithSeq.union(markerTupleRow); + CCNetWithSeq= CCNetWithSeq.sort("ID"); + + Dataset reflexivKmer; + StructType ReflexivLongKmerStructCompressed = new StructType(); + ReflexivLongKmerStructCompressed= ReflexivLongKmerStructCompressed.add("k-1", DataTypes.createArrayType(DataTypes.LongType), false); + ReflexivLongKmerStructCompressed= ReflexivLongKmerStructCompressed.add("attribute", DataTypes.LongType, false); + ReflexivLongKmerStructCompressed= ReflexivLongKmerStructCompressed.add("extension", DataTypes.createArrayType(DataTypes.LongType), false); + ExpressionEncoder ReflexivLongSubKmerEncoderCompressed = RowEncoder.apply(ReflexivLongKmerStructCompressed); + + ChangingEndsOfConnectableContigs ModifyContig = new ChangingEndsOfConnectableContigs(); + reflexivKmer = CCNetWithSeq.mapPartitions(ModifyContig, ReflexivLongSubKmerEncoderCompressed); + + DSExtendConnectableContigLoop connectContig = new DSExtendConnectableContigLoop(); + + reflexivKmer = reflexivKmer.sort("k-1"); + + //loop + for (int i=0; i<30; i++) { + reflexivKmer = reflexivKmer.sort("k-1"); + reflexivKmer = reflexivKmer.mapPartitions(connectContig, ReflexivLongSubKmerEncoderCompressed); + } + + + Dataset reflexivFullKmer; + StructType markerTupleStruct = new StructType(); + markerTupleStruct = markerTupleStruct.add("kmer", DataTypes.createArrayType(DataTypes.LongType), false); + ExpressionEncoder markerTupleEncoder = RowEncoder.apply(markerTupleStruct); + + DSBinaryFixingKmerToFullKmer FixingKmer2FullKmer = new DSBinaryFixingKmerToFullKmer(); + reflexivFullKmer = reflexivKmer.mapPartitions(FixingKmer2FullKmer, markerTupleEncoder); + + reflexivFullKmer.persist(StorageLevel.DISK_ONLY()); + JavaPairRDD ContigsRDDIndex; + ContigsRDDIndex = reflexivFullKmer.toJavaRDD().zipWithIndex(); + + Dataset> markerTuple; + markerTuple = spark.createDataset(ContigsRDDIndex.rdd(), Encoders.tuple(markerTupleEncoder, Encoders.LONG())); + + StructType ContigLongKmerStringStruct = new StructType(); + ContigLongKmerStringStruct = ContigLongKmerStringStruct.add("ID", DataTypes.StringType, false); + ContigLongKmerStringStruct = ContigLongKmerStringStruct.add("contig", DataTypes.StringType, false); + ExpressionEncoder ContigStringEncoder = RowEncoder.apply(ContigLongKmerStringStruct); + + TagRowContigRDDID DSContigIDLabel = new TagRowContigRDDID(); + ContigDS = markerTuple.flatMap(DSContigIDLabel, ContigStringEncoder); + + ContigDS.write(). + mode(SaveMode.Overwrite). + format("csv"). + option("compression", "gzip").save(param.outputPath + "/Assembly_intermediate/04Patching"); + + spark.stop(); + + } + + class TagRowContigRDDID implements FlatMapFunction, Row>, Serializable { + + List contigList; + + public Iterator call(Tuple2 s) { + + contigList = new ArrayList(); + + String contig = BinaryBlocksToString(seq2array(s._1().getSeq(0))); + int length = contig.length(); + if (length >= param.minContig) { + String ID = ">Contig-" + length + "-" + s._2(); + String formatedContig = changeLine(contig, length, 10000000); + contigList.add(RowFactory.create(ID, formatedContig)); + } + + return contigList.iterator(); + } + + private long[] seq2array(Seq a){ + long[] array =new long[a.length()]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + + private String BinaryBlocksToString (long[] binaryBlocks){ + // String KmerString=""; + int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + StringBuilder sb= new StringBuilder(); + char currentNucleotide; + + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + sb.append(currentNucleotide); + } + + return sb.toString(); + } + + private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ + int kmerSize; + int blockSize = binaryBlocks.length; + kmerSize= (blockSize-1) *31; + final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- + int lastMers = Long.SIZE/2-suffix0s/2-1; + + kmerSize+=lastMers; + return kmerSize; + + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0L) { + nucleotide = 'A'; + } else if (twoBits == 1L) { + nucleotide = 'C'; + } else if (twoBits == 2L) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; + } + + public String changeLine(String oneLine, int lineLength, int limitedLength) { + String blockLine = ""; + int fold = lineLength / limitedLength; + int remainder = lineLength % limitedLength; + if (fold == 0) { + blockLine = oneLine; + } else if (fold == 1 && remainder == 0) { + blockLine = oneLine; + } else if (fold > 1 && remainder == 0) { + for (int i = 0; i < fold - 1; i++) { + blockLine += oneLine.substring(i * limitedLength, (i + 1) * limitedLength) + "\n"; + } + blockLine += oneLine.substring((fold - 1) * limitedLength); + } else { + for (int i = 0; i < fold; i++) { + blockLine += oneLine.substring(i * limitedLength, (i + 1) * limitedLength) + "\n"; + } + blockLine += oneLine.substring(fold * limitedLength); + } + + return blockLine; + } + } + + class DSFastqFilterOnlySeq implements MapPartitionsFunction, Serializable{ + ArrayList seqArray = new ArrayList(); + //String line; + //int lineMark = 0; + + public Iterator call(Iterator sIterator) { + while (sIterator.hasNext()) { + String s = sIterator.next(); + if (s.length()<= 20) { + continue; + } else if (s.startsWith("@")) { + continue; + } else if (s.startsWith("+")) { + continue; + } else if (!checkSeq(s.charAt(0))) { + continue; + } else if (!checkSeq(s.charAt(4))){ + continue; + } else if (!checkSeq(s.charAt(9))){ + continue; + } else if (!checkSeq(s.charAt(14))){ + continue; + } else if (!checkSeq(s.charAt(19))){ + continue; + } else { + seqArray.add(s); + } + } + + return seqArray.iterator(); + } + + private boolean checkSeq(char a){ + int match =0; + if (a=='A'){ + match++; + }else if (a=='T'){ + match++; + }else if (a=='C'){ + match++; + }else if (a=='G'){ + match++; + }else if (a=='N'){ + match++; + } + + if (match >0){ + return true; + }else{ + return false; + } + } + + /* + public Iterator call(Iterator sIterator) { + while (sIterator.hasNext()) { + String s = sIterator.next(); + if (lineMark == 2) { + lineMark++; + } else if (lineMark == 3) { + lineMark++; + seqArray.add(line); + } else if (s.startsWith("@")) { + lineMark = 1; + } else if (lineMark == 1) { + line = s; + lineMark++; + } + } + + return seqArray.iterator(); + } + */ + +/* + public String call(String s) { + if (lineMark == 2) { + lineMark++; + return null; + } else if (lineMark == 3) { + lineMark++; + return line; + } else if (s.startsWith("@")) { + lineMark = 1; + return null; + } else if (lineMark == 1) { + line = s; + lineMark++; + return null; + }else{ + return null; + } + } + */ + } + + class DSInputTupleToString implements FlatMapFunction>, String>, Serializable { + List reflexivKmerStringList = new ArrayList(); + String seq; + + public Iterator call(Iterator> sIterator) throws Exception { + while (sIterator.hasNext()) { + + Tuple2 s = sIterator.next(); + seq = s._2().toString(); +/* + if (seq.length()<= 20) { + continue; + } else if (seq.startsWith("@")) { + continue; + } else if (seq.startsWith("+")) { + continue; + } else if (!checkSeq(seq.charAt(0))) { + continue; + } else if (!checkSeq(seq.charAt(4))){ + continue; + } else if (!checkSeq(seq.charAt(9))){ + continue; + } else if (!checkSeq(seq.charAt(14))){ + continue; + } else if (!checkSeq(seq.charAt(19))){ + continue; + } else { + reflexivKmerStringList.add(seq); + } +*/ + reflexivKmerStringList.add(seq); + } + return reflexivKmerStringList.iterator(); + } + + private boolean checkSeq(char a){ + int match =0; + if (a=='A'){ + match++; + }else if (a=='T'){ + match++; + }else if (a=='C'){ + match++; + }else if (a=='G'){ + match++; + }else if (a=='N'){ + match++; + } + + if (match >0){ + return true; + }else{ + return false; + } + } + } + + class DynamicKmerBinarizerFromReducedToSubKmer implements MapPartitionsFunction, Serializable{ + List kmerList = new ArrayList(); + Row units; + String ID; + String extension; + int currentExtensionSize; + int currentExtensionBlockSize; + long attribute; + char nucleotide; + long nucleotideInt; + // Long suffixBinary; + // Long[] suffixBinaryArray; + + + public Iterator call(Iterator s) { + + while (s.hasNext()) { + units = s.next(); + + // ID = units.getString(0); + extension = units.getString(0); + + //if (ID.startsWith("(")) { + // ID = ID.substring(1); + //} + + + currentExtensionSize = extension.length(); + currentExtensionBlockSize = (currentExtensionSize-1)/31+1; + + // if (!kmerSizeCheck(kmer, param.kmerListHash)){continue;} // the kmer length does not fit into any of the kmers in the list. +/* + if (units.getString(0).endsWith(")")) { + String[] attributeStringArray = ID.split("\\-"); + attribute =Long.parseLong(attributeStringArray[2]); + // attribute = Long.parseLong(StringUtils.chop(units.getString(1))); + } else { + String[] attributeStringArray = ID.split("\\-"); + attribute =Long.parseLong(attributeStringArray[2]); + // attribute = Long.parseLong(units.getString(1)); + } +*/ + + long[] extensionBinarySlot = new long[currentExtensionBlockSize]; + + for (int i = 0; i < currentExtensionSize; i++) { + nucleotide = extension.charAt(i); + if (nucleotide >= 256) nucleotide = 255; + nucleotideInt = nucleotideValue(nucleotide); + // forward kmer in bits + nucleotideInt <<= 2*(32-1-(i%31)); // shift to the left [ATCGGATCC-,ATCGGATCC-] + + extensionBinarySlot[i / 31] |= nucleotideInt; + } + + long kmerEndMark = 1L; + + kmerEndMark <<= 2*(32-1-((currentExtensionSize-1)%31+1)); + extensionBinarySlot[currentExtensionBlockSize-1] |= kmerEndMark; // param.kmerListHash.get(currentKmerSize)] == currentKmerBlockSize + + // attribute= onlyChangeReflexivMarker(attribute,1); + kmerList.add( + RowFactory.create(extensionBinarySlot, 0, -1) // -1 as marker for k-mer more than 2 coverage + ); + } + + return kmerList.iterator(); + } + + private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ + long info = (long) ReflexivMarker <<2*(32-1); //move to the left most + + /** + * shorten the int and change negative to positive to avoid two's complementary + */ + if (leftCover>=30000){ + leftCover=30000; + }else if (leftCover<=-30000){ + leftCover=30000-(-30000); + }else if (leftCover<0){ + leftCover=30000-leftCover; + } + + if (rightCover>=30000){ + rightCover=30000; + }else if (rightCover<=-30000){ + rightCover=30000-(-30000); + }else if (rightCover<0){ + rightCover=30000-rightCover; + } + + info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left + info |= ((long) rightCover); // 01--LeftCover---RightCover + + return info; + } + + private long onlyChangeReflexivMarker(long oldMarker, int reflexivMarker){ + Long maxSubKmerBinary = ~((~0L) << 2 * 31); + long newMarker = oldMarker & maxSubKmerBinary; + newMarker |= ((long) reflexivMarker) << 2*(32-1); + return newMarker; + } + + private boolean kmerSizeCheck(String kmer, HashMap kmerList){ + if (kmerList.containsKey(kmer.length())) { + return true; + }else { + return false; + } + } + + private long nucleotideValue(char a) { + long value; + if (a == 'A') { + value = 0L; + } else if (a == 'C') { + value = 1L; + } else if (a == 'G') { + value = 2L; + } else { // T + value = 3L; + } + return value; + } + + private String BinaryBlocksToString (long[] binaryBlocks){ + String KmerString=""; + int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + KmerString += currentNucleotide; + } + + return KmerString; + } + private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ + int kmerSize; + int blockSize = binaryBlocks.length; + kmerSize= (blockSize-1) *31; + final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- + int lastMers = Long.SIZE/2-suffix0s/2-1; + + kmerSize+=lastMers; + return kmerSize; + + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0L) { + nucleotide = 'A'; + } else if (twoBits == 1L) { + nucleotide = 'C'; + } else if (twoBits == 2L) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; + } + + } + + class DSBinaryFixingKmerToFullKmer implements MapPartitionsFunction, Serializable{ + List reflexivKmerStringList = new ArrayList(); + + long[] subKmerArray = new long[1]; + long[] combinedArray; + long[] extensionArray; + + + public Iterator call(Iterator sIterator) throws Exception { + while (sIterator.hasNext()) { + Row s = sIterator.next(); + + if (s.get(0) instanceof Seq){ + subKmerArray = seq2array(s.getSeq(0)); + }else{ + subKmerArray = (long[]) s.get(0); + } + + if (s.get(2) instanceof Seq) { + extensionArray = seq2array(s.getSeq(2)); + }else{ + extensionArray = (long[]) s.get(2); + } + + if (getReflexivMarker(s.getLong(1)) ==1){ + combinedArray = combineTwoLongBlocks( subKmerArray, extensionArray); + }else{ + combinedArray = combineTwoLongBlocks( extensionArray, subKmerArray ); + } + + + if (currentKmerSizeFromBinaryBlockArray(combinedArray) < param.minContig){ + continue; + } + + + reflexivKmerStringList.add( + RowFactory.create(combinedArray) + ); + } + return reflexivKmerStringList.iterator(); + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0) { + nucleotide = 'A'; + } else if (twoBits == 1) { + nucleotide = 'C'; + } else if (twoBits == 2) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; + } + + private String BinaryBlocksToString (long[] binaryBlocks){ + // String KmerString=""; + int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + StringBuilder sb= new StringBuilder(); + char currentNucleotide; + + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + sb.append(currentNucleotide); + } + + return sb.toString(); + } + + private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { + int startingBlockIndex = (shiftingLength)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length + + int remainLength=nucleotideLength-shiftingLength-1; + if (remainLength <0){ + remainLength=0; + } + long[] newBlock = new long[remainLength/31+1]; + int relativeShiftSize = shiftingLength % 31; + + if (shiftingLength >= nucleotideLength){ + // apparantly, it is possible. meaning the block has nothing left + // throw new Exception("shifting length longer than the kmer length"); + newBlock[0]|=(1L<<2*31); //add c marker at the end + return newBlock; + } + + // if (relativeShiftSize ==0) then only shifting blocks + + int j=0; // new index for shifted blocks + // long oldShiftOut=0L; // if only one block, then 0 bits +// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex +// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); + // } + for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted + newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- + newBlock[j] |= shiftOut; + newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary + + j++; + } + + if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block + newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; + }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end + newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 + } // else the last block has been completely shift into the new last block, including the C marker + + return newBlock; + + } + + private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ + int relativeShiftSize = shiftingLength % 31; + int endingBlockIndex = (shiftingLength-1)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + long[] shiftOutBlocks = new long[endingBlockIndex+1]; + + if (shiftingLength > nucleotideLength){ + // throw new Exception("shifting length longer than the kmer length"); + return blocks; + } + + for (int i=0; i 0) { + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 + shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); + }else{ // relativeShiftSize == 0; + if (endingBlockIndex+1 == blocks.length) { // a block with C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + }else{ // endingBlockIndex < blocks.length -1 means a block without C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC + } + + } + + return shiftOutBlocks; + } + + private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ + int kmerSize; + int blockSize = binaryBlocks.length; + kmerSize= (blockSize-1) *31; + final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- + int lastMers = Long.SIZE/2-suffix0s/2-1; + + kmerSize+=lastMers; + return kmerSize; + + } + + + private long[] seq2array(Seq a){ + long[] array =new long[a.length()]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + + private int getReflexivMarker(long attribute){ + int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker + return reflexivMarker; + } + + private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throws Exception { + int leftNucleotideLength = currentKmerSizeFromBinaryBlockArray(leftBlocks); + int leftRelativeNTLength = (leftNucleotideLength-1) % 31+1; + int leftVacancy = 31-leftRelativeNTLength; + int rightNucleotideLength = currentKmerSizeFromBinaryBlockArray(rightBlocks); + int combinedBlockSize = (leftNucleotideLength+rightNucleotideLength-1)/31+1; + long[] newBlocks= new long[combinedBlockSize]; + + if (rightNucleotideLength==0){ + return leftBlocks; + } + + if (leftNucleotideLength==0){ + return rightBlocks; + } + + if (leftVacancy ==0){ // left last block is a perfect block + for (int i =0; i>> 2*(leftRelativeNTLength)); + if (leftBlocks.length, Serializable{ + List ReflexibleKmerList = new ArrayList(); + List tmpConsecutiveContigs = new ArrayList(); + Row lastID=null; + long attribute; + + public Iterator call(Iterator sIterator) throws Exception{ + while (sIterator.hasNext()){ + Row s = sIterator.next(); + + if (s.getSeq(1).length()<4){ + continue; + } + + // System.out.println("contigID: " + s.getLong(0)); + + if (lastID==null){ + lastID=s; + continue; + } + + if (s.getLong(0) == lastID.getLong(0)){ + System.out.println("added: " + s); + tmpConsecutiveContigs.add(seq2array(s.getSeq(1))); + }else{ + attribute = buildingAlongFromThreeInt(1, -1, -1); + if (tmpConsecutiveContigs.size()>0){ + tmpConsecutiveContigs.add(seq2array(lastID.getSeq(1))); + System.out.println("added: " + lastID); + if (tmpConsecutiveContigs.size()>3){ + for (int i=0;i(); + }else { // lastID is alone and probably a normal contig + long[] contigArray = seq2array(lastID.getSeq(1)); + long[] subKmer = leftShiftOutFromArray(contigArray, 61); + long[] extension = leftShiftArray(contigArray, 61); + ReflexibleKmerList.add(RowFactory.create(subKmer, attribute, extension)); + } + + lastID = s; + } + } + + attribute = buildingAlongFromThreeInt(1, -1, -1); + if (tmpConsecutiveContigs.size()>0){ + tmpConsecutiveContigs.add(seq2array(lastID.getSeq(1))); + if (tmpConsecutiveContigs.size()>3){ + System.out.println("id with more than 3 contigs 2: " + lastID.getLong(0)); + } + long[] modifiedContig = updatingContig(tmpConsecutiveContigs); + ReflexibleKmerList.add(RowFactory.create(leftShiftOutFromArray(modifiedContig,61), attribute, leftShiftArray(modifiedContig, 61))); + + tmpConsecutiveContigs=new ArrayList(); + }else { // lastID is alone and probably a normal contig + long[] contigArray = seq2array(lastID.getSeq(1)); + long[] subKmer = leftShiftOutFromArray(contigArray, 61); + long[] extension = leftShiftArray(contigArray, 61); + ReflexibleKmerList.add(RowFactory.create(subKmer, attribute, extension)); + } + + return ReflexibleKmerList.iterator(); + } + + private long[] seq2array(Seq a){ + long[] array =new long[a.length()]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + + private long[] updatingContig(List contigSet) throws Exception { + long[] newContig = null; + long[] rightContig = null; + long[] leftContig = null; + + if (contigSet.size()>3){ // left and right + // interesting scenario, should not happen + System.out.println("debugging needed 1"); + }else if (contigSet.size()>2){ + + if (contigSet.get(0)[contigSet.get(0).length-2]!=0){ // the contig + if (contigSet.get(1)[contigSet.get(1).length-2] !=0){ + // another contig? + System.out.println("debugging needed 2"); + }else if (contigSet.get(2)[contigSet.get(2).length-2] !=0){ + // another contig? + System.out.println("debugging needed 3"); + } + newContig = contigSet.get(0); + + if (contigSet.get(1)[contigSet.get(1).length-1] <0){ // right contig + rightContig = contigSet.get(1); + }else if (contigSet.get(1)[contigSet.get(1).length-1]>0){ // left contig + leftContig = contigSet.get(1); + } + + if (contigSet.get(2)[contigSet.get(2).length-1] <0){ // right contig + rightContig = contigSet.get(2); + }else if (contigSet.get(2)[contigSet.get(2).length-1] >0){ // left contig + leftContig= contigSet.get(2); + } + }else if (contigSet.get(1)[contigSet.get(1).length-1]!=0){ // second one is the contig + if (contigSet.get(0)[contigSet.get(0).length-2] !=0){ + // another contig? + System.out.println("debugging needed 4"); + }else if (contigSet.get(2)[contigSet.get(2).length-2] !=0){ + // another contig? + System.out.println("debugging needed 5"); + } + newContig=contigSet.get(1); + + if (contigSet.get(0)[contigSet.get(0).length-1] <0){ // right contig + rightContig = contigSet.get(0); + }else if (contigSet.get(0)[contigSet.get(0).length-1]>0){ // left contig + leftContig = contigSet.get(0); + } + + if (contigSet.get(2)[contigSet.get(2).length-1] <0){ // right contig + rightContig = contigSet.get(2); + }else if (contigSet.get(2)[contigSet.get(2).length-1] >0){ // left contig + leftContig= contigSet.get(2); + } + + }else { // the last one is the contig + + if (contigSet.get(0)[contigSet.get(0).length-2] !=0){ + // another contig? + System.out.println("debugging needed 6"); + }else if (contigSet.get(1)[contigSet.get(1).length-2] !=0){ + // another contig? + System.out.println("debugging needed 7"); + } + newContig=contigSet.get(2); + + if (contigSet.get(0)[contigSet.get(0).length-1] <0){ // right contig + rightContig = contigSet.get(0); + }else if (contigSet.get(0)[contigSet.get(0).length-1]>0){ // left contig + leftContig = contigSet.get(0); + } + + if (contigSet.get(1)[contigSet.get(1).length-1] <0){ // right contig + rightContig = contigSet.get(1); + }else if (contigSet.get(1)[contigSet.get(1).length-1] >0){ // left contig + leftContig= contigSet.get(1); + } + + } + }else { // if (contigSet.size()==2){ + if (contigSet.get(0)[contigSet.get(0).length-2]!=0) { // the contig + newContig=contigSet.get(0); + if (contigSet.get(1)[contigSet.get(1).length-1] <0){ // right contig + rightContig = contigSet.get(1); + }else if (contigSet.get(1)[contigSet.get(1).length-1]>0){ // left contig + leftContig = contigSet.get(1); + } + }else{ + newContig=contigSet.get(1); + if (contigSet.get(0)[contigSet.get(0).length-1] <0){ // right contig + rightContig = contigSet.get(0); + }else if (contigSet.get(0)[contigSet.get(0).length-1]>0){ // left contig + leftContig = contigSet.get(0); + } + + } + } + + // System.out.println("oldContig: " + BinaryBlocksToString(newContig)); + + if (leftContig!=null){ + int contigLength = currentKmerSizeFromBinaryBlockArray(newContig); + if (leftContig[leftContig.length-1] < contigLength) { + newContig = leftShiftOutFromArray(newContig, (int)leftContig[leftContig.length-1]); + leftContig = removeTailingTwoSlots(leftContig); + + // System.out.println("leftRead: " + BinaryBlocksToString(leftContig)); + newContig = combineTwoLongBlocks(newContig, leftContig); // get all bases from read and give to leftContig + // System.out.println("newContig: " + BinaryBlocksToString(newContig)); + }else{ + // System.out.println("leftContig Index: " + leftContig[leftContig.length-1] + " is bigger or equal to contig length: " + contigLength); + } + } + + if (rightContig!=null){ + if (rightContig[rightContig.length-2]!=0){ + System.out.println("Warning: not a right Contig"); + } + int negativeIndex = (int)rightContig[rightContig.length-1]; + + if (negativeIndex>=0){ + System.out.println("right contig should not have positive index: " + negativeIndex); + } + + rightContig = removeTailingTwoSlots(rightContig); + + // System.out.println("rightRead: " + BinaryBlocksToString(rightContig)); + int readLength= currentKmerSizeFromBinaryBlockArray(rightContig); + int offset = readLength + negativeIndex; + + newContig = leftShiftArray(newContig, offset); + rightContig = leftShiftArray(rightContig, readLength-61); // shift out 31 nt for right contig + newContig = combineTwoLongBlocks(rightContig,newContig); + // System.out.println("newContig: " + BinaryBlocksToString(newContig)); + } + + + + + return newContig; + } + + private String BinaryBlocksToString (long[] binaryBlocks){ + // String KmerString=""; + int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + StringBuilder sb= new StringBuilder(); + char currentNucleotide; + + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + sb.append(currentNucleotide); + } + + return sb.toString(); + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0) { + nucleotide = 'A'; + } else if (twoBits == 1) { + nucleotide = 'C'; + } else if (twoBits == 2) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; + } + + private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ + int kmerSize; + int blockSize = binaryBlocks.length; + kmerSize= (blockSize-1) *31; + final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- + int lastMers = Long.SIZE/2-suffix0s/2-1; + + kmerSize+=lastMers; + return kmerSize; + + } + + private long[] removeTailingTwoSlots(long[] withTails){ + long[] withOutTails = new long[withTails.length-2]; + for (int i=0; i=30000){ + leftCover=30000; + }else if (leftCover<=-30000){ + leftCover=30000-(-30000); + }else if (leftCover<0){ + leftCover=30000-leftCover; + } + + if (rightCover>=30000){ + rightCover=30000; + }else if (rightCover<=-30000){ + rightCover=30000-(-30000); + }else if (rightCover<0){ + rightCover=30000-rightCover; + } + + info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left + info |= ((long) rightCover); // 01--LeftCover---RightCover + + return info; + } + + private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { + int startingBlockIndex = (shiftingLength)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length + + int remainLength=nucleotideLength-shiftingLength-1; + if (remainLength <0){ + remainLength=0; + } + long[] newBlock = new long[remainLength/31+1]; + int relativeShiftSize = shiftingLength % 31; + + if (shiftingLength >= nucleotideLength){ + // apparantly, it is possible. meaning the block has nothing left + // throw new Exception("shifting length longer than the kmer length"); + newBlock[0]|=(1L<<2*31); //add c marker at the end + return newBlock; + } + + // if (relativeShiftSize ==0) then only shifting blocks + + int j=0; // new index for shifted blocks + // long oldShiftOut=0L; // if only one block, then 0 bits +// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex +// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); + // } + for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted + newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- + newBlock[j] |= shiftOut; + newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary + + j++; + } + + if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block + newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; + }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end + newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 + } // else the last block has been completely shift into the new last block, including the C marker + + return newBlock; + + } + + private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ + int relativeShiftSize = shiftingLength % 31; + int endingBlockIndex = (shiftingLength-1)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + long[] shiftOutBlocks = new long[endingBlockIndex+1]; + + if (shiftingLength > nucleotideLength){ + // throw new Exception("shifting length longer than the kmer length"); + return blocks; + } + + for (int i=0; i 0) { + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 + shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); + }else{ // relativeShiftSize == 0; + if (endingBlockIndex+1 == blocks.length) { // a block with C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + }else{ // endingBlockIndex < blocks.length -1 means a block without C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC + } + + } + + return shiftOutBlocks; + } + + private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throws Exception { + int leftNucleotideLength = currentKmerSizeFromBinaryBlockArray(leftBlocks); + int leftRelativeNTLength = (leftNucleotideLength-1) % 31+1; + int leftVacancy = 31-leftRelativeNTLength; + int rightNucleotideLength = currentKmerSizeFromBinaryBlockArray(rightBlocks); + int combinedBlockSize = (leftNucleotideLength+rightNucleotideLength-1)/31+1; + long[] newBlocks= new long[combinedBlockSize]; + + if (rightNucleotideLength==0){ + return leftBlocks; + } + + if (leftNucleotideLength==0){ + return rightBlocks; + } + + if (leftVacancy ==0){ // left last block is a perfect block + for (int i =0; i>> 2*(leftRelativeNTLength)); + if (leftBlocks.length, Serializable{ + List CCReads = new ArrayList(); + Row lastRead=null; + Row lastMarker=null; + long[] leftContigReadMeta; + long[] rightContigReadMeta; + + public Iterator call(Iterator sIterator) throws Exception{ + while (sIterator.hasNext()){ + Row s= sIterator.next(); + + if ((Long) s.getSeq(1).apply(0) == 0) { // a marker + if (lastRead!=null) { + if (lastRead.getLong(0) == s.getLong(0)) { + // System.out.println("read: " + s.getLong(0) + " marker: " + s.getSeq(1).apply(0) + " index: " + getLeftIndex((Long) s.getSeq(1).apply(1)) + " | " + getRightIndex((Long) s.getSeq(1).apply(1)) + " leftContig: " + s.getSeq(1).apply(2) + " rightContig: " + s.getSeq(1).apply(3)); + + + int leftIndex = getLeftIndex((Long) s.getSeq(1).apply(1)); + int rightIndex = getRightIndex((Long) s.getSeq(1).apply(1)); + + long[] readSeq= seq2array(lastRead.getSeq(1)); + if ((Long) s.getSeq(1).apply(4) ==1){ + readSeq = binaryBlockReverseComplementary(readSeq); + } + + // String read= BinaryBlocksToString (readSeq); + + leftContigReadMeta = arrayWithTwoMoreSlots(readSeq); + leftContigReadMeta[leftContigReadMeta.length - 2] = 0L; + leftContigReadMeta[leftContigReadMeta.length - 1] = leftIndex; + + rightContigReadMeta = arrayWithTwoMoreSlots(readSeq); + rightContigReadMeta[rightContigReadMeta.length - 2] = 0L; + rightContigReadMeta[rightContigReadMeta.length - 1] = rightIndex; + + CCReads.add(RowFactory.create(s.getSeq(1).apply(2), leftContigReadMeta)); // contig, seqArray, 0L, index + CCReads.add(RowFactory.create(s.getSeq(1).apply(3), rightContigReadMeta)); + + // System.out.println("modify: " + s.getSeq(1).apply(2) + " left index: " + leftContigReadMeta[leftContigReadMeta.length - 1] + " readID " + lastRead.getLong(0) + " seq " + read); + // System.out.println("modify: " + s.getSeq(1).apply(3) + " right index: " + rightContigReadMeta[rightContigReadMeta.length - 1] + " readID " + lastRead.getLong(0) + " seq " + read ); + } + } + + lastMarker = s; + }else{ // a read + if (lastMarker!=null) { + + if (s.getLong(0) == lastMarker.getLong(0)) { // matches last marker + // System.out.println("read: " + lastMarker.getLong(0) + " marker: " + lastMarker.getSeq(1).apply(0) + " index: " + getLeftIndex((Long) lastMarker.getSeq(1).apply(1)) + " | " + getRightIndex((Long) lastMarker.getSeq(1).apply(1)) + " leftContig: " + lastMarker.getSeq(1).apply(2) + " rightContig: " + lastMarker.getSeq(1).apply(3)); + + + int leftIndex = getLeftIndex((Long) lastMarker.getSeq(1).apply(1)); + int rightIndex = getRightIndex((Long) lastMarker.getSeq(1).apply(1)); + + long[] readSeq= seq2array(s.getSeq(1)); + if ((Long) lastMarker.getSeq(1).apply(4) ==1){ + readSeq = binaryBlockReverseComplementary(readSeq); + } + + // String read= BinaryBlocksToString (readSeq); + + leftContigReadMeta = arrayWithTwoMoreSlots(readSeq); + leftContigReadMeta[leftContigReadMeta.length - 2] = 0L; + leftContigReadMeta[leftContigReadMeta.length - 1] = leftIndex; + + rightContigReadMeta = arrayWithTwoMoreSlots(readSeq); + rightContigReadMeta[rightContigReadMeta.length - 2] = 0L; + rightContigReadMeta[rightContigReadMeta.length - 1] = rightIndex; + + CCReads.add(RowFactory.create(lastMarker.getSeq(1).apply(2), leftContigReadMeta)); + CCReads.add(RowFactory.create(lastMarker.getSeq(1).apply(3), rightContigReadMeta)); + + // System.out.println("modify: " + lastMarker.getSeq(1).apply(2) + " left index: " + leftContigReadMeta[leftContigReadMeta.length - 1] + " readID " + s.getLong(0) + " seq " + read); + // System.out.println("modify: " + lastMarker.getSeq(1).apply(3) + " right index: " + rightContigReadMeta[rightContigReadMeta.length - 1] + " readID " + s.getLong(0) + " seq " + read); + + } + } + + lastRead=s; + } + + + } + + + return CCReads.iterator(); + } + + private int getLeftIndex(long combinedDuo){ + return (int) (combinedDuo >>> 2*16); + } + + private int getRightIndex(long combinedDuo){ + return (int) combinedDuo; + } + + private long[] arrayWithTwoMoreSlots(long[] a){ // add two slots at the end for other meta data + long[] array =new long[a.length+2]; + for (int i = 0; i < a.length; i++) { + array[i] = (Long) a[i]; + } + return array; + } + + private long[] seq2arrayWithTwoMoreSlots(Seq a){ // add two slots at the end for other meta data + long[] array =new long[a.length()+2]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + + private long[] binaryBlockReverseComplementary(long[] forward){ + int currentKmerResidue = currentKmerResidueFromBlockArray(forward); + int currentKmerSize = currentKmerSizeFromBinaryBlockArray(forward); + int currentKmerBlockSize=forward.length; + long[] reverseComplement; + long lastTwoBits; + + reverseComplement = new long[currentKmerBlockSize]; + + for (int i = 0; i < currentKmerSize; i++) { + int RCindex = currentKmerSize - i - 1; // ------------- ------------- ---------**-- RC index goes reverse + // ------------- ------------- -------**---- <-- + // reverseComplement[i / 31] <<= 2; + + if (RCindex >= currentKmerSize - currentKmerResidue) { + lastTwoBits = forward[RCindex / 31] >>> 2 * (32-(RCindex % 31)-1); // ------------- ------------- ------|----** + lastTwoBits &= 3L; + lastTwoBits ^= 3L; + } else { // the same + lastTwoBits = forward[RCindex / 31] >>> 2 * (32 - (RCindex % 31) - 1); + lastTwoBits &= 3L; + lastTwoBits ^= 3L; + } + + reverseComplement[i / 31] |= lastTwoBits; + reverseComplement[i / 31] <<=2; // the order of these two lines are very important + + } + reverseComplement[(currentKmerSize-1)/31] <<= 2*(32-currentKmerResidue-1); // ---xxxxxxx -> xxxxxxx--- extra -1 because there are a vacancy from the step above + reverseComplement[(currentKmerSize-1)/31]|=(1L<<2*(32-currentKmerResidue-1)); // adding ending marker C + + return reverseComplement; + } + + private int currentKmerResidueFromBlockArray(long[] binaryBlocks){ + final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[binaryBlocks.length-1]); + return Long.SIZE/2 - suffix0s/2 -1; + } + + private long[] seq2array(Seq a){ + long[] array =new long[a.length()]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + + private String BinaryBlocksToString (long[] binaryBlocks){ + // String KmerString=""; + int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + StringBuilder sb= new StringBuilder(); + char currentNucleotide; + + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + sb.append(currentNucleotide); + } + + return sb.toString(); + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0) { + nucleotide = 'A'; + } else if (twoBits == 1) { + nucleotide = 'C'; + } else if (twoBits == 2) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; + } + + private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ + int kmerSize; + int blockSize = binaryBlocks.length; + kmerSize= (blockSize-1) *31; + final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- + int lastMers = Long.SIZE/2-suffix0s/2-1; + + kmerSize+=lastMers; + return kmerSize; + + } + + } + + class CCPairsToConnectionsRight implements MapPartitionsFunction, Serializable{ + List CCNet = new ArrayList(); + int mark=0; + Row lastHighest=null; + Row secondHighest=null; + + public Iterator call(Iterator sIterator) throws Exception{ + while (sIterator.hasNext()){ + Row s = sIterator.next(); + + /** + * C1 C2 index read + * C1 C3 index read + * C1 C4 index read + * C2 C5 index read + * C3 C6 index read + */ + + // System.out.println("leftContig: " + s.getLong(0) + " rightContig: " + s.getLong(1) + " index: " + getLeftIndex(s.getLong(2)) + " | " + getRightIndex(s.getLong(2)) + " read " + s.getLong(3) + " count " + s.getLong(4) ); + + if (lastHighest==null){ + lastHighest=s; + mark=1; + } + + if (s.getLong(1) == lastHighest.getLong(1)) { + if (mark==0) { + lastHighest = s; + mark++; + }else if (mark == 1){ + secondHighest = s; + mark++; + }//else{ + // other coverages are not important + //} + + }else{ // new left contig + + if (lastHighest == null) { + lastHighest=s; + }else if (secondHighest==null){ + long[] markerReadInfo = new long[5]; + markerReadInfo[0] =0L; + markerReadInfo[1] = lastHighest.getLong(2); + markerReadInfo[2] = lastHighest.getLong(0); + markerReadInfo[3] = lastHighest.getLong(1); + markerReadInfo[4] = 0; + + long readID= lastHighest.getLong(3); + if (lastHighest.getLong(3)<0){ + readID = -lastHighest.getLong(3); + markerReadInfo[4] = 1; + } + + CCNet.add(RowFactory.create(readID, markerReadInfo)); + // System.out.println("leftContig Uniq: " + readID+ " index " + getLeftIndex(markerReadInfo[1]) + " | " + getRightIndex(markerReadInfo[1]) + " leftContig " + markerReadInfo[2] + " rightContig " + markerReadInfo[3] + " reverse or not " + markerReadInfo[4]); + }else{ // two different left contig + + if (secondHighest.getLong(4) <=4 && lastHighest.getLong(4) / secondHighest.getLong(4) >=3) { + + long[] markerReadInfo = new long[5]; + markerReadInfo[0] = 0L; + markerReadInfo[1] = lastHighest.getLong(2); + markerReadInfo[2] = lastHighest.getLong(0); + markerReadInfo[3] = lastHighest.getLong(1); + markerReadInfo[4] = 0; + + long readID=lastHighest.getLong(3); + if (lastHighest.getLong(3)<0){ + readID = -lastHighest.getLong(3); + markerReadInfo[4] = 1; + } + + CCNet.add(RowFactory.create(readID, markerReadInfo)); + // System.out.println("leftContig Uniq: " + readID+ " index " + getLeftIndex(markerReadInfo[1]) + " | " + getRightIndex(markerReadInfo[1]) + " leftContig " + markerReadInfo[2] + " rightContig " + markerReadInfo[3] + " reverse or not " + markerReadInfo[4]); + } + } + + lastHighest=s; + mark=1; + secondHighest=null; + + } + + } + + if (lastHighest == null) { + // last right contig does not have a match + }else if (secondHighest==null){ + long[] markerReadInfo = new long[5]; + markerReadInfo[0] =0L; + markerReadInfo[1] = lastHighest.getLong(2); + markerReadInfo[2] = lastHighest.getLong(0); + markerReadInfo[3] = lastHighest.getLong(1); + markerReadInfo[4] = 0; + + long readID= lastHighest.getLong(3); + if (lastHighest.getLong(3)<0){ + readID = -lastHighest.getLong(3); + markerReadInfo[4] = 1; + } + + CCNet.add(RowFactory.create(readID, markerReadInfo)); + + // System.out.println("leftContig Uniq: " + readID + " index " + getLeftIndex(markerReadInfo[1]) + " | " + getRightIndex(markerReadInfo[1]) + " leftContig " + markerReadInfo[2] + " rightContig " + markerReadInfo[3] + " reverse or not " + markerReadInfo[4]); + }else{ // two different left contig + + if (secondHighest.getLong(4) <=4 && lastHighest.getLong(4) / secondHighest.getLong(4) >=3) { + + long[] markerReadInfo = new long[5]; + markerReadInfo[0] = 0L; + markerReadInfo[1] = lastHighest.getLong(2); + markerReadInfo[2] = lastHighest.getLong(0); + markerReadInfo[3] = lastHighest.getLong(1); + markerReadInfo[4] = 0; + + long readID= lastHighest.getLong(3); + if (lastHighest.getLong(3)<0){ + readID = -lastHighest.getLong(3); + markerReadInfo[4] = 1; + } + + CCNet.add(RowFactory.create(readID, markerReadInfo)); + // System.out.println("leftContig Uniq: " + readID + " index " + getLeftIndex(markerReadInfo[1]) + " | " + getRightIndex(markerReadInfo[1]) + " leftContig " + markerReadInfo[2] + " rightContig " + markerReadInfo[3] + " reverse or not " + markerReadInfo[4]); + } + } + + return CCNet.iterator(); + } + + private int getLeftIndex(long combinedDuo){ + return (int) (combinedDuo >>> 2*16); + } + + private int getRightIndex(long combinedDuo){ + return (int) combinedDuo; + } + } + + class CCPairsToConnections implements MapPartitionsFunction, Serializable{ + List CCNet = new ArrayList(); + long lastLeftContig=-1; + long lastRightTarget=-1; + long lastIndex=0; + long lastRead=0; + int lastRightCount=1; + long[] targetAndCount; + List rightTargetAndCount = new ArrayList(); + + public Iterator call(Iterator sIterator) throws Exception{ + while (sIterator.hasNext()){ + Row s = sIterator.next(); + + /** + * C1 C2 index read + * C1 C3 index read + * C1 C4 index read + * C2 C5 index read + * C3 C6 index read + */ + + // System.out.println("leftContig: " + s.getLong(0) + " rightContig: " + s.getLong(1) + " index: " + getLeftIndex(s.getLong(2)) + " | " + getRightIndex(s.getLong(2)) + " read " + s.getLong(3)); + + if (lastLeftContig == -1){ + lastLeftContig=s.getLong(0); + lastRightCount=1; + lastRightTarget=s.getLong(1); + lastIndex=s.getLong(2); + lastRead=s.getLong(3); + continue; + } + + if (s.getLong(0) == lastLeftContig) { + if (s.getLong(1) == lastRightTarget) { + lastRightCount++; + } else { + if (lastRightCount>=2) { + targetAndCount = new long[5]; + targetAndCount[0] = lastRightTarget; + targetAndCount[1] = lastRightCount; + targetAndCount[2] = lastIndex; + targetAndCount[3] = lastRead; + targetAndCount[4] = lastLeftContig; + rightTargetAndCount.add(targetAndCount); + } + //rightTargetCounts.add(); + lastRightTarget = s.getLong(1); + lastIndex= s.getLong(2); + lastRead=s.getLong(3); + lastRightCount = 1; + } + }else{ // new left contig + + if (lastRightCount>=2) { + targetAndCount = new long[5]; + targetAndCount[0] = lastRightTarget; + targetAndCount[1] = lastRightCount; + targetAndCount[2] = lastIndex; + targetAndCount[3] = lastRead; + targetAndCount[4] = lastLeftContig; + rightTargetAndCount.add(targetAndCount); + } + // lastRightTarget = s.getLong(1); + // lastRightCount = 1; + + if (rightTargetAndCount.size()>1) { + Collections.sort(rightTargetAndCount, new Comparator() { + @Override + public int compare(long[] o1, long[] o2) { + return o1[1] < o2[1] ? 1 : o1[1] == o2[1] ? 0 : -1; // descending + /// return 0; + } + }); + + long secondHighest = rightTargetAndCount.get(1)[1]; + if (secondHighest <= 4) { + if (rightTargetAndCount.get(0)[1] / secondHighest >= 3) { + + CCNet.add(RowFactory.create(rightTargetAndCount.get(0)[4], rightTargetAndCount.get(0)[0],rightTargetAndCount.get(0)[2],rightTargetAndCount.get(0)[3], rightTargetAndCount.get(0)[1])); + // CCNet.add(RowFactory.create(rightTargetAndCount.get(0)[3], rightTargetAndCount.get(0)[1], markerReadInfo)); + + // System.out.println("lastRead: " + rightTargetAndCount.get(0)[3] + " count " + rightTargetAndCount.get(0)[1] + " index " + getLeftIndex(rightTargetAndCount.get(0)[2]) + " | " + getRightIndex(rightTargetAndCount.get(0)[2]) + " leftContig " + lastLeftContig + " rightContig " + rightTargetAndCount.get(0)[0]); + } + } + }else if (rightTargetAndCount.size()==1){ + + CCNet.add(RowFactory.create(rightTargetAndCount.get(0)[4], rightTargetAndCount.get(0)[0], rightTargetAndCount.get(0)[2],rightTargetAndCount.get(0)[3], rightTargetAndCount.get(0)[1])); + // CCNet.add(RowFactory.create(rightTargetAndCount.get(0)[3], rightTargetAndCount.get(0)[1], markerReadInfo)); + // System.out.println("lastRead: " + rightTargetAndCount.get(0)[3] + " count " + rightTargetAndCount.get(0)[1] + " index " + getLeftIndex(rightTargetAndCount.get(0)[2]) + " | " + getRightIndex(rightTargetAndCount.get(0)[2])+ " leftContig " + lastLeftContig + " rightContig " + rightTargetAndCount.get(0)[0]); + } + + + rightTargetAndCount=new ArrayList(); + lastRightCount=1; + lastRightTarget = s.getLong(1); + lastLeftContig=s.getLong(0); + lastIndex= s.getLong(2); + lastRead= s.getLong(3); + } + + } + + if (lastRightCount>=2) { + targetAndCount = new long[5]; + targetAndCount[0] = lastRightTarget; + targetAndCount[1] = lastRightCount; + targetAndCount[2] = lastIndex; + targetAndCount[3] = lastRead; + targetAndCount[4] = lastLeftContig; + rightTargetAndCount.add(targetAndCount); + } + + //lastRightTarget = s.getLong(1); + // lastRightCount = 1; + + if (rightTargetAndCount.size()>1) { + Collections.sort(rightTargetAndCount, new Comparator() { + @Override + public int compare(long[] o1, long[] o2) { + return o1[1] < o2[1] ? 1 : o1[1] == o2[1] ? 0 : -1; // descending + /// return 0; + } + }); + + long secondHighest = rightTargetAndCount.get(1)[1]; + if (secondHighest <= 4) { + if (rightTargetAndCount.get(0)[1] / secondHighest >= 3) { + + CCNet.add(RowFactory.create(rightTargetAndCount.get(0)[4], rightTargetAndCount.get(0)[0],rightTargetAndCount.get(0)[2],rightTargetAndCount.get(0)[3], rightTargetAndCount.get(0)[1])); + // CCNet.add(RowFactory.create(rightTargetAndCount.get(0)[3], rightTargetAndCount.get(0)[1], markerReadInfo)); + // System.out.println("lastRead: " + rightTargetAndCount.get(0)[3] + " count " + rightTargetAndCount.get(0)[1] + " index " + getLeftIndex(rightTargetAndCount.get(0)[2]) + " | " + getRightIndex(rightTargetAndCount.get(0)[2])+ " leftContig " + rightTargetAndCount.get(0)[4] + " rightContig " + rightTargetAndCount.get(0)[0]); + } + } + }else if (rightTargetAndCount.size() ==1){ + + CCNet.add(RowFactory.create(rightTargetAndCount.get(0)[4], rightTargetAndCount.get(0)[0], rightTargetAndCount.get(0)[2],rightTargetAndCount.get(0)[3], rightTargetAndCount.get(0)[1])); + // CCNet.add(RowFactory.create(rightTargetAndCount.get(0)[3], rightTargetAndCount.get(0)[1], markerReadInfo)); + // System.out.println("lastRead: " + rightTargetAndCount.get(0)[3] + " count " + rightTargetAndCount.get(0)[1] + " index " + getLeftIndex(rightTargetAndCount.get(0)[2]) + " | " + getRightIndex(rightTargetAndCount.get(0)[2])+ " leftContig " + rightTargetAndCount.get(0)[4] + " rightContig " + rightTargetAndCount.get(0)[0]); + } + + return CCNet.iterator(); + } + + private int getLeftIndex(long combinedDuo){ + return (int) (combinedDuo >>> 2*16); + } + + private int getRightIndex(long combinedDuo){ + return (int) combinedDuo; + } + + private boolean compareTwoTargets(long[] target1, long[] target2){ + return true; + } + } + + class CreatCCPairs implements MapPartitionsFunction, Serializable{ + List CCPairs = new ArrayList(); + List leftContigList = new ArrayList(); + List leftIndexList = new ArrayList(); + List rightContigList = new ArrayList(); + List rightIndexList= new ArrayList(); + List contigList = new ArrayList(); + List indexList = new ArrayList(); + long lastRead =0; + long lastContig=0; + + public Iterator call(Iterator sIterator) throws Exception{ + while (sIterator.hasNext()) { + Row s = sIterator.next(); + + + // System.out.println("Read: " + s.getLong(0) + " contig: " + s.getLong(1) + " index: " + s.getInt(2)); + // R1 C1 index + // R1 C1 index + // R1 C2 index + if (s.getLong(0) == lastRead){ + if (s.getLong(1) == lastContig){ + indexList.add(s.getInt(2)); + }else { + int finalIndex; + finalIndex = highestFrequency(indexList); + + + if (finalIndex> -100000000) { + if (finalIndex < 0) { //right contig + rightContigList.add(lastContig); + rightIndexList.add(finalIndex); + } else if (finalIndex > param.maxKmerSize) { // left contig + leftContigList.add(lastContig); + leftIndexList.add(finalIndex); + } + } + + + lastContig = s.getLong(1); + indexList = new ArrayList(); + indexList.add(s.getInt(2)); + } + + + // R1 C1 index + // R2 C2 index + }else{ + + // unload the last read and contig + if (indexList.size()>0){ + int finalIndex; + + finalIndex = highestFrequency(indexList); + + + if (finalIndex> -100000000) { + if (finalIndex < 0) { //right contig + rightContigList.add(lastContig); + rightIndexList.add(finalIndex); + } else if (finalIndex > param.maxKmerSize) { // left contig + leftContigList.add(lastContig); + leftIndexList.add(finalIndex); + } + } + + lastContig = s.getLong(1); + indexList = new ArrayList(); + indexList.add(s.getInt(2)); + }else{ + indexList.add(s.getInt(2)); + } + + // building contig contig pairs + for (int i =0 ;i(); + leftIndexList= new ArrayList(); + rightContigList = new ArrayList(); + rightIndexList= new ArrayList(); + } + } + + // unload the last read and contig + if (indexList.size()>0){ + int finalIndex; + + finalIndex = highestFrequency(indexList); + + // finalIndex = indexList.get(0); + + + if (finalIndex> -100000000) { + if (finalIndex < 0) { //right contig + rightContigList.add(lastContig); + rightIndexList.add(finalIndex); + } else if (finalIndex > param.maxKmerSize) { // left contig + leftContigList.add(lastContig); + leftIndexList.add(finalIndex); + } + } + + // last element, no need to reset + // lastContig = s.getLong(1); + //indexList = null; + // indexList.add(s.getInt(2)); + }else{ + // indexList.add(s.getInt(2)); + } + + for (int i =0 ;i>>= 2*16; + + return ((long)leftIndex) <<2*16 | ForCast ; + } + + private int highestFrequency (List numberlist){ + + if (numberlist.size()==1){ + return numberlist.get(0); + } + + Collections.sort(numberlist); + + int finalIndex=numberlist.get(0); + int lastIndex=numberlist.get(0); + int frequency =1; + int highestFrequency=1; + for (int i=1;i < numberlist.size(); i++){ + + // System.out.println("test highestFrequency: " + numberlist.get(i)); + + if (numberlist.get(i) == lastIndex){ + frequency++; + if (frequency > highestFrequency){ + highestFrequency = frequency; + + finalIndex= numberlist.get(i); + } + }else{ + lastIndex= numberlist.get(i); + frequency=1; + } + } + + // if (highestFrequency<2){ + // return -100000000; + // } + + // System.out.println("highest: " + finalIndex + " frequency: " + highestFrequency); + + return finalIndex; + } + + private int getLeftIndex(long combinedDuo){ + return (int) (combinedDuo >>> 2*16); + } + + private int getRightIndex(long combinedDuo){ + return (int) combinedDuo; + } + } + + class ReadAndContigPairs implements MapPartitionsFunction, Serializable{ + List RACpairs = new ArrayList(); + int lastSeed=0; + int seedKmerSize = 15; + List readList = new ArrayList(); + List indexList = new ArrayList(); + List contigList = new ArrayList(); + List contigIndexList = new ArrayList(); + + boolean emptyContig =false; + + + public Iterator call(Iterator sIterator) throws Exception { + while (sIterator.hasNext()) { + Row s = sIterator.next(); + + int seed = getSeedIntFromLong(s.getLong(0)); + int index = getIndexIntFromLong(s.getLong(0)); + int ROC = getROCIntFromLong(s.getLong(0)); + + // System.out.println("seed: " + BinaryLongToString(s.getLong(0)) + " seedValue" + seed + " index " + index + " Mark " + ROC); + if (seed == lastSeed){ // they same k-mer seed + if (emptyContig){ + continue; + } + + if (ROC ==0){ // contig + contigList.add(s.getLong(1)); + contigIndexList.add(index); + }else { // read + for (int i = 0; i < contigList.size(); i++) { // iterate all contigs + + int relativeIndex = contigIndexList.get(i) - index; // read's relative index on the contig + RACpairs.add(RowFactory.create(s.getLong(1), contigList.get(i), relativeIndex)); + + // System.out.println("Read: " + s.getLong(1) + " contigID: " + contigList.get(i) + " index " + relativeIndex); + + // ---xxxxx-------------------- contig + //-------xxxxx----- read + //| reads relative location on the contig (negative value) + // if (contigIndexList.get(i) < param.maxKmerSize - seedKmerSize) { + // left + // relativeIndex = contigIndexList.get(i) - index; + //} else { + // ------------------xxxxx--- contig + // -----xxxxx-------- read + // |reads relative location + // right + // relativeIndex = contigIndexList.get(i) - index; + // } + } + } + + }else{ // a new k-mer seed + + if (ROC ==1){ // this k-mer only exists in reads not contigs + emptyContig = true; + }else{ + emptyContig = false; + + contigList=new ArrayList(); + contigIndexList=new ArrayList(); + contigList.add(s.getLong(1)); + contigIndexList.add(index); + + lastSeed = seed; + } + } + + lastSeed = seed; + + } + + return RACpairs.iterator(); + } + + private String BinaryLongToString (long binaryBlocks){ // this one has been modified for k-mer 15 + // String KmerString=""; + // int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + int KmerLength = 15; + StringBuilder sb= new StringBuilder(); + char currentNucleotide; + + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks>>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + sb.append(currentNucleotide); + } + + return sb.toString(); + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0) { + nucleotide = 'A'; + } else if (twoBits == 1) { + nucleotide = 'C'; + } else if (twoBits == 2) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; + } + + private int getSeedIntFromLong(long compress) { + // xxxxxxxxC|R--index + // xxxxxxxxC + compress >>>= 2*16; + return (int) compress; + } + + private int getIndexIntFromLong(long compress){ + // xxxxxxxxC|R--index + // index + long indexMaxBit = ~((~0L) <<2*15); + compress &=indexMaxBit; + return (int) compress; + } + + private int getROCIntFromLong(long compress){ + // xxxxxxxxC|R--index + // R + + return (int)compress >>> 2*15; + } + + } + + class ExtractMercyKmerFromRead implements MapPartitionsFunction, Serializable{ + List MercyKmer = new ArrayList<>(); + + Row lastRead = null; + Row lastMarker = null; + List lastMarkerArray = new ArrayList(); + long[] seqArray; + + public Iterator call(Iterator sIterator) throws Exception{ + + // ------------ read + // 0-1--- RC marker + // 0+1--- marker + // ------------ read + while (sIterator.hasNext()){ + Row s = sIterator.next(); + + seqArray = seq2array(s.getSeq(1)); + + if (seqArray[0]== 0){ // a marker with ranges , might have two markers because of reverse complement + if (lastRead != null){ + if (lastRead.getLong(0) == s.getLong(0)){ + extractKmer(lastRead.getSeq(1), s.getSeq(1)); + }else{ + lastMarkerArray.add(s); + } + }else { + lastMarkerArray.add(s); + } + }else{ // a read with sequence + for (int i=0; i(); + + lastRead = s; + } + + } + + // the leftover of lastMarkerArray will not find a read anymore + + return MercyKmer.iterator(); + } + + private void extractKmer(Seq markerSeq, Seq readSeq) throws Exception { + long[] markerArray = seq2array(markerSeq); + long[] readArray = seq2array(readSeq); + + for (int i=1; i= nucleotideLength){ + // apparantly, it is possible. meaning the block has nothing left + // throw new Exception("shifting length longer than the kmer length"); + newBlock[0]|=(1L<<2*31); //add c marker at the end + return newBlock; + } + + // if (relativeShiftSize ==0) then only shifting blocks + + int j=0; // new index for shifted blocks + // long oldShiftOut=0L; // if only one block, then 0 bits +// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex +// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); + // } + for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted + newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- + newBlock[j] |= shiftOut; + newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary + + j++; + } + + if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block + newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; + }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end + newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 + } // else the last block has been completely shift into the new last block, including the C marker + + return newBlock; + + } + + private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ + int relativeShiftSize = shiftingLength % 31; + int endingBlockIndex = (shiftingLength-1)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + long[] shiftOutBlocks = new long[endingBlockIndex+1]; + + if (shiftingLength > nucleotideLength){ + // throw new Exception("shifting length longer than the kmer length"); + return blocks; + } + + for (int i=0; i 0) { + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 + shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); + }else{ // relativeShiftSize == 0; + if (endingBlockIndex+1 == blocks.length) { // a block with C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + }else{ // endingBlockIndex < blocks.length -1 means a block without C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC + } + + } + + return shiftOutBlocks; + } + + private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ + int kmerSize; + int blockSize = binaryBlocks.length; + kmerSize= (blockSize-1) *31; + final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- + int lastMers = Long.SIZE/2-suffix0s/2-1; + + kmerSize+=lastMers; + return kmerSize; + + } + + + private long[] seq2array(Seq a){ + long[] array =new long[a.length()]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + + private int getLeftMarker(long attribute){ + int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker + int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 + leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker + + if (leftMarker>30000){ + leftMarker=30000-leftMarker; + } + + return leftMarker; + } + + private int getRightMarker(long attribute){ + int rightMarker = (int) attribute; + + if (rightMarker>30000){ + rightMarker=30000-rightMarker; + } + + return rightMarker; + } + } + + class RamenReadRangeCal implements MapPartitionsFunction, Serializable{ + List ReadsAndRange = new ArrayList(); + + Row lastKmer=null; + + long[] rangeArray = new long[2]; + + List indices = new ArrayList(); + + public Iterator call(Iterator sIterator) throws Exception{ + while (sIterator.hasNext()){ + Row s = sIterator.next(); + if (lastKmer == null){ + lastKmer = s; + indices.add((int)s.getLong(1)); + }else{ + if (s.getLong(0) == lastKmer.getLong(0)){ + indices.add((int)s.getLong(1)); + }else{ + if (indices.size()>1) { + List ranges = findRange(indices, lastKmer.getLong(0)); + // rangeArray[0] = 0; + // rangeArray[1] = range; + + + if (ranges.size() > 2){ + + if (ranges.get(1) <0){ // reverse complement + ReadsAndRange.add( + RowFactory.create(-lastKmer.getLong(0), ranges) + ); + }else { + ReadsAndRange.add( + RowFactory.create(lastKmer.getLong(0), ranges) + ); + } + } + } + + indices = new ArrayList(); + } + } + } + + if (indices.size()>1){ + List ranges = findRange(indices, lastKmer.getLong(0)); + + if (ranges.size() > 1){ + if (ranges.get(1) <0){ // reverse complement , ranges.get(1) should equal lastKmer.getLong(0) + ReadsAndRange.add( + RowFactory.create(-lastKmer.getLong(0), ranges) + ); + }else { + ReadsAndRange.add( + RowFactory.create(lastKmer.getLong(0), ranges) + ); + } + } + } + + return ReadsAndRange.iterator(); + } + + private List findRange(List i, long index){ + long range=0; + long[] gapsArray; + + List gaps = new ArrayList(); + gaps.add(0L); // add an 0 in the front of the list + gaps.add(index); // for reverse complement detection + + Collections.sort(i); + int lastIndex = i.get(0); + + int a = 0; + int b = 0; + for (int j =1 ; j 1){ + a= lastIndex; + b = j; + + range = buildingAlongFromThreeInt(1, a, b); + gaps.add(range); + } + } + + gapsArray = new long[gaps.size()]; + + for (int k=0; k< gaps.size(); k++){ + + } + + return gaps; + } + + private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ + long info = (long) ReflexivMarker <<2*(32-1); //move to the left most + + /** + * shorten the int and change negative to positive to avoid two's complementary + */ + if (leftCover>=30000){ + leftCover=30000; + }else if (leftCover<=-30000){ + leftCover=30000-(-30000); + }else if (leftCover<0){ + leftCover=30000-leftCover; + } + + if (rightCover>=30000){ + rightCover=30000; + }else if (rightCover<=-30000){ + rightCover=30000-(-30000); + }else if (rightCover<0){ + rightCover=30000-rightCover; + } + + info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left + info |= ((long) rightCover); // 01--LeftCover---RightCover + + return info; + } + + + } + + class RamenReadExtraction implements MapPartitionsFunction, Serializable{ + List ReadsAndIndices = new ArrayList(); + + Row lastKmer=null; + + List ReadsKmerBuffer= new ArrayList(); + + public Iterator call(Iterator sIterator) throws Exception{ + + while (sIterator.hasNext()){ + Row s = sIterator.next(); + + // ------- k-mer + // ------- read + // ------- read + // ------- k-mer + if (s.getLong(2) == -1){ // a more than 2x coverage k-mer + if (ReadsKmerBuffer.size()>0){ + + for (int i =0; i< ReadsKmerBuffer.size();i++){ + if (dynamicSubKmerComparator(ReadsKmerBuffer.get(i).getSeq(0), s.getSeq(0)) == true){ + ReadsAndIndices.add( + RowFactory.create(ReadsKmerBuffer.get(i).getLong(1), ReadsKmerBuffer.get(i).getLong(2)) + ); + } + } + + ReadsKmerBuffer = new ArrayList(); + } + + lastKmer = s; + }else{ // a read k-mer + if (lastKmer !=null){ + if (dynamicSubKmerComparator(lastKmer.getSeq(0), s.getSeq(0)) == true){ + ReadsAndIndices.add( + RowFactory.create(s.getLong(1), s.getLong(2)) + ); + }else { + // --------- read 1 k-mer + // --------- read 2 k-mer different + // ----- + if (ReadsKmerBuffer.size()>0) { + if (dynamicSubKmerComparator(ReadsKmerBuffer.get(ReadsKmerBuffer.size() - 1).getSeq(0), s.getSeq(0)) == true) { + ReadsKmerBuffer.add(s); + } else { + ReadsKmerBuffer = new ArrayList(); + ReadsKmerBuffer.add(s); + } + }else{ + ReadsKmerBuffer.add(s); + } + } + }else{ + ReadsKmerBuffer.add(s); + } + } + + } + + return ReadsAndIndices.iterator(); + } + + private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ + int kmerSize; + int blockSize = binaryBlocks.length; + kmerSize= (blockSize-1) *31; + final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- + int lastMers = Long.SIZE/2-suffix0s/2-1; + + kmerSize+=lastMers; + return kmerSize; + + } + + private long[] seq2array(Seq a){ + long[] array =new long[a.length()]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + + private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ + int relativeShiftSize = shiftingLength % 31; + int endingBlockIndex = (shiftingLength-1)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + long[] shiftOutBlocks = new long[endingBlockIndex+1]; + + if (shiftingLength > nucleotideLength){ + return blocks; + // throw new Exception("shifting length longer than the kmer length"); + } + + for (int i=0; i 0) { + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 + shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); + }else{ // relativeShiftSize == 0; + if (endingBlockIndex+1 == blocks.length) { // a block with C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + }else{ // endingBlockIndex < blocks.length -1 means a block without C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC + } + + } + + return shiftOutBlocks; + } + + + private boolean dynamicSubKmerComparator(Seq a, Seq b) throws Exception { + long[] arrayA = seq2array(a); + long[] arrayB = seq2array(b); + + int aLength= currentKmerSizeFromBinaryBlockArray(arrayA); + int bLength= currentKmerSizeFromBinaryBlockArray(arrayB); + + if (aLength>bLength){ // equal should not happen + long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); + // String longer = BinaryBlocksToString(shorterVersion); + // String shorter = BinaryBlocksToString(arrayB); + // System.out.println("longer: " + longer + " shorter: " + shorter); + // if (shorterVersion.length>=2 && arrayB.length >=2) { + // System.out.println("longer array: " + shorterVersion[0] + " " + shorterVersion[1] + " shorter array: " + arrayB[0] + " " + arrayB[1]); + //} + if (Arrays.equals(shorterVersion, arrayB)){ + // if (shorterVersion.length>=2){ + // System.out.println("marker!!!"); + // } + return true; + }else{ + return false; + } + }else{ + long[] shorterVersion = leftShiftOutFromArray(arrayB, aLength); + if (Arrays.equals(shorterVersion, arrayA)){ + return true; + }else{ + return false; + } + } + } + } + + class ContigKmerMarkerExtraction implements MapPartitionsFunction, Serializable{ + List SeedKmerList = new ArrayList(); + long[] fullKmerArray; + long contigID; + int kmerLength; + int SeedKmerSize =15; + long maxKmerBinary =(~0L) << 2 * (32-SeedKmerSize); + + public Iterator call(Iterator sIterator) throws Exception { + while (sIterator.hasNext()) { + Row s = sIterator.next(); + //fullKmerArray = seq2array(s.getSeq(1)); + fullKmerArray= (long[]) s.get(1); + contigID=s.getLong(0); + kmerLength = currentKmerSizeFromBinaryBlockArray(fullKmerArray); + + // System.out.println("contig: " + BinaryBlocksToString(fullKmerArray)); + + if (kmerLength >=2*param.maxKmerSize){ + long[] fixedKmerLeft; + long[] fixedKmerRight; + + for (int i=0;i>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + sb.append(currentNucleotide); + } + + return sb.toString(); + } + + private long[] seq2array(Seq a){ + long[] array =new long[a.length()]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + + private String BinaryBlocksToString (long[] binaryBlocks){ + // String KmerString=""; + int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + StringBuilder sb= new StringBuilder(); + char currentNucleotide; + + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + sb.append(currentNucleotide); + } + + return sb.toString(); + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0) { + nucleotide = 'A'; + } else if (twoBits == 1) { + nucleotide = 'C'; + } else if (twoBits == 2) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; + } + + private long buildingAlongForCompression(long kmer, int index, int ROC){ // ROC read or contig + // xxxxxxxxxC|R----index assuming contig length smaller than 1G + + long ROCLong = (long) ROC << 2*15; + kmer|= ROCLong; + return kmer|(long) index; + } + + private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ + int kmerSize; + int blockSize = binaryBlocks.length; + kmerSize= (blockSize-1) *31; + final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- + int lastMers = Long.SIZE/2-suffix0s/2-1; + + kmerSize+=lastMers; + return kmerSize; + + } + + + private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { + int startingBlockIndex = (shiftingLength)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length + + int remainLength=nucleotideLength-shiftingLength-1; + if (remainLength <0){ + remainLength=0; + } + long[] newBlock = new long[remainLength/31+1]; + int relativeShiftSize = shiftingLength % 31; + + if (shiftingLength >= nucleotideLength){ + // apparantly, it is possible. meaning the block has nothing left + // throw new Exception("shifting length longer than the kmer length"); + newBlock[0]|=(1L<<2*31); //add c marker at the end + return newBlock; + } + + // if (relativeShiftSize ==0) then only shifting blocks + + int j=0; // new index for shifted blocks + // long oldShiftOut=0L; // if only one block, then 0 bits +// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex +// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); + // } + for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted + newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- + newBlock[j] |= shiftOut; + newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary + + j++; + } + + if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block + newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; + }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end + newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 + } // else the last block has been completely shift into the new last block, including the C marker + + return newBlock; + + } + + private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ + int relativeShiftSize = shiftingLength % 31; + int endingBlockIndex = (shiftingLength-1)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + long[] shiftOutBlocks = new long[endingBlockIndex+1]; + + if (shiftingLength > nucleotideLength){ + // throw new Exception("shifting length longer than the kmer length"); + return blocks; + } + + for (int i=0; i 0) { + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 + shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); + }else{ // relativeShiftSize == 0; + if (endingBlockIndex+1 == blocks.length) { // a block with C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + }else{ // endingBlockIndex < blocks.length -1 means a block without C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC + } + + } + + return shiftOutBlocks; + } + + } + + class TagRowContigID implements FlatMapFunction, String>, Serializable { + + public Iterator call(Tuple2 s) { + + + List contigList = new ArrayList(); + + contigList.add(s._1.getString(0) + "-" + s._2 + "\n" + s._1.getString(1)); + + return contigList.iterator(); + } + } + + class TagContigID implements FlatMapFunction, Long>, String>, Serializable { + + public Iterator call(Tuple2, Long> s) { + + + List contigList = new ArrayList(); + + contigList.add(s._1._1 + "-" + s._2 + "\n" + s._1._2); + + return contigList.iterator(); + } + } + + class DSKmerToContig implements MapPartitionsFunction, Serializable { + + public Iterator call(Iterator sIterator) { + List contigList = new ArrayList(); + + while (sIterator.hasNext()) { + Row s = sIterator.next(); + if (getReflexivMarker(s.getLong(1)) == 1) { + String contig = s.getString(0) + s.getString(2); + int length = contig.length(); + if (length >= param.minContig) { + String ID = ">Contig-" + length + "-" + getLeftMarker(s.getLong(1)) + "-" + getRightMarker(s.getLong(1)); + String formatedContig = changeLine(contig, length, 10000000); + contigList.add(RowFactory.create(ID, formatedContig)); + } + } else { // (randomReflexivMarker == 2) { + String contig = s.getString(2) + s.getString(0); + int length = contig.length(); + if (length >= param.minContig) { + String ID = ">Contig-" + length + "-" + getLeftMarker(s.getLong(1)) + "-" + getRightMarker(s.getLong(1)); + String formatedContig = changeLine(contig, length, 10000000); + contigList.add(RowFactory.create(ID, formatedContig)); + } + } + } + + return contigList.iterator(); + } + + public String changeLine(String oneLine, int lineLength, int limitedLength) { + String blockLine = ""; + int fold = lineLength / limitedLength; + int remainder = lineLength % limitedLength; + if (fold == 0) { + blockLine = oneLine; + } else if (fold == 1 && remainder == 0) { + blockLine = oneLine; + } else if (fold > 1 && remainder == 0) { + for (int i = 0; i < fold - 1; i++) { + blockLine += oneLine.substring(i * limitedLength, (i + 1) * limitedLength) + "\n"; + } + blockLine += oneLine.substring((fold - 1) * limitedLength); + } else { + for (int i = 0; i < fold; i++) { + blockLine += oneLine.substring(i * limitedLength, (i + 1) * limitedLength) + "\n"; + } + blockLine += oneLine.substring(fold * limitedLength); + } + + return blockLine; + } + + private int getReflexivMarker(long attribute){ + int reflexivMarker = (int) (attribute >>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker + return reflexivMarker; + } + + private int getLeftMarker(long attribute){ + int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker + int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 + leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker + + if (leftMarker>30000){ + leftMarker=30000-leftMarker; + } + + return leftMarker; + } + + private int getRightMarker(long attribute){ + int rightMarker = (int) attribute; + + if (rightMarker>30000){ + rightMarker=30000-rightMarker; + } + + return rightMarker; + } + + } + + class DSExtendConnectableContigLoop implements MapPartitionsFunction, Serializable { + + /* marker to identify similar SubKmers in the loop sequence */ + private int lineMarker = 1; + + /* 1 stands for forward sub-kmer */ + /* 2 stands for reflexiv sub-kmer */ + // private int randomReflexivMarker = ThreadLocalRandom.current().nextInt(1, 3); + private int randomReflexivMarker = 2; + + + + + long maxSubKmerResidueBinary = ~((~0L) << 2 * param.subKmerSizeResidue); + long maxSubKmerBinary = ~((~0L) << 2 * 31); + + + //long maxBlockBinary = ~((~0L) << 2*31); // a block has 31 nucleotide + + + /* temporary capsule to store identical SubKmer units */ + List tmpReflexivKmerExtendList = new ArrayList(); + + /* return capsule of extend Tuples for next iteration*/ + List reflexivKmerConcatList = new ArrayList(); + + /** + * + * @param sIterator is the input data structure Tuple2> + * s._1 represents sub kmer sequence + * s._2._1 represents sub kmer marker: 1, for forward sub kmer; + * 2, for reverse (reflexiv) sub kmer; + * s._2._2 represents the rest sequence. + * s._2._2 represents the coverage of the K-mer + * @return a list of extended Tuples for next iteration + */ + public Iterator call(Iterator sIterator) throws Exception { + + if (param.scramble ==3){ + randomReflexivMarker =1; + } + + while (sIterator.hasNext()) { + Row s = sIterator.next(); + + /* receive the first sub-kmer, set new units */ + if (lineMarker == 1) { + resetSubKmerGroup(s); + + // return reflexivKmerConcatList.iterator(); + } + + /* removal condition */ + /** + * Deprecated function for killer k-mers + */ + + /* next element of RDD */ + else {/* if (lineMarker >= 2){ */ + /* initiate a new capsule for the current sub-kmer group */ + // reflexivKmerConcatList = new ArrayList>>(); + + if (tmpReflexivKmerExtendList.size() == 0) { + directKmerComparison(s); + } else { /* tmpReflexivKmerExtendList.size() != 0 */ + for (int i = 0; i < tmpReflexivKmerExtendList.size(); i++) { // the tmpReflexivKmerExtendList is changing dynamically + if (subKmerSlotComparator(s.getSeq(0), tmpReflexivKmerExtendList.get(i).getSeq(0)) || dynamicSubKmerComparator(s.getSeq(0), tmpReflexivKmerExtendList.get(i).getSeq(0))) { + // System.out.println("loop array extend. first leftMarker: " + getLeftMarker(s.getLong(1)) + " rightMarker: " + getRightMarker(s.getLong(1)) + " second leftMarker: " + getLeftMarker(tmpReflexivKmerExtendList.get(i).getLong(1)) + " rightMarker: " + getRightMarker(tmpReflexivKmerExtendList.get(i).getLong(1))); + if (getReflexivMarker(s.getLong(1))== 1) { + if (getReflexivMarker(tmpReflexivKmerExtendList.get(i).getLong(1)) == 2) { + // residue length + int tmpReflexivKmerSuffixLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros((Long) tmpReflexivKmerExtendList.get(i).getSeq(2).apply(tmpReflexivKmerExtendList.get(i).getSeq(2).size()-1)) / 2 + 1); + // extended overall length + int tmpBlockSize = (tmpReflexivKmerExtendList.get(i).getSeq(2).length() - 1) * 31 + tmpReflexivKmerSuffixLength; + int currentReflexivKmerSuffixLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros((Long) s.getSeq(2).apply(s.getSeq(2).size()-1)) / 2 + 1); + int currentBlockSize = (s.getSeq(2).length() - 1) * 31 + currentReflexivKmerSuffixLength; + + + int lengthS = currentKmerSizeFromBinaryBlockArray(seq2array(s.getSeq(0))); + int lengthTemp= currentKmerSizeFromBinaryBlockArray(seq2array(tmpReflexivKmerExtendList.get(i).getSeq(0))); + + int extraLength=0; + if (lengthTemp< lengthS){ + extraLength=lengthS-lengthTemp; + } + + if (lengthS=10){ // only for read based patching, too long contig is not going to connect with shorter contig + // singleKmerRandomizer(s); + // break; + // } + else if (lengthTemp <=500){ // only for read based patching, too long contig is not going to connect with shorter contig + singleKmerRandomizer(s); + break; + } + else if (getLeftMarker(s.getLong(1))< 0 && getRightMarker(tmpReflexivKmerExtendList.get(i).getLong(1))< 0) { + reflexivExtend(s, tmpReflexivKmerExtendList.get(i), -1); + tmpReflexivKmerExtendList.remove(i); /* already extended */ + break; + } else if (getLeftMarker(s.getLong(1)) >= 0 && getRightMarker(tmpReflexivKmerExtendList.get(i).getLong(1))>= 0) { + reflexivExtend(s, tmpReflexivKmerExtendList.get(i), -1); + tmpReflexivKmerExtendList.remove(i); /* already extended */ + break; + } else if (getLeftMarker(s.getLong(1)) >= 0 && getLeftMarker(s.getLong(1)) - tmpBlockSize >= 0) { + reflexivExtend(s, tmpReflexivKmerExtendList.get(i), getLeftMarker(s.getLong(1))- tmpBlockSize); + tmpReflexivKmerExtendList.remove(i); /* already extended */ + break; + } else if (getRightMarker(tmpReflexivKmerExtendList.get(i).getLong(1))>= 0 && getRightMarker(tmpReflexivKmerExtendList.get(i).getLong(1))- currentBlockSize -extraLength >= 0) { + reflexivExtend(s, tmpReflexivKmerExtendList.get(i), getRightMarker(tmpReflexivKmerExtendList.get(i).getLong(1))- currentBlockSize); + tmpReflexivKmerExtendList.remove(i); /* already extended */ + break; + } else { + singleKmerRandomizer(s); + break; + } + } else if (getReflexivMarker(tmpReflexivKmerExtendList.get(i).getLong(1))== 1) { + singleKmerRandomizer(s); + //directKmerComparison(s); + break; + } + } else { /* if (s.getInt(1) == 2) { */ + if (getReflexivMarker(tmpReflexivKmerExtendList.get(i).getLong(1))== 2) { + singleKmerRandomizer(s); + //directKmerComparison(s); + break; + } else if (getReflexivMarker(tmpReflexivKmerExtendList.get(i).getLong(1))== 1) { + // residue length + int tmpReflexivKmerSuffixLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros((Long) tmpReflexivKmerExtendList.get(i).getSeq(2).apply(tmpReflexivKmerExtendList.get(i).getSeq(2).size()-1)) / 2 + 1); + // extended overall length + int tmpBlockSize = (tmpReflexivKmerExtendList.get(i).getSeq(2).length() - 1) * 31 + tmpReflexivKmerSuffixLength; + int currentReflexivKmerSuffixLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros((Long) s.getSeq(2).apply(s.getSeq(2).size()-1)) / 2 + 1); + int currentBlockSize = (s.getSeq(2).length() - 1) * 31 + currentReflexivKmerSuffixLength; + + int lengthS = currentKmerSizeFromBinaryBlockArray(seq2array(s.getSeq(0))); + int lengthTemp= currentKmerSizeFromBinaryBlockArray(seq2array(tmpReflexivKmerExtendList.get(i).getSeq(0))); + + int extraLength=0; + if (lengthS< lengthTemp){ + extraLength=lengthTemp-lengthS; + } + + if (lengthTemp=10){ // only for read based patching, too long contig is not going to connect with shorter contig + // singleKmerRandomizer(s); + // break; + // } + else if (lengthS <=500){ // only for read based patching, too long contig is not going to connect with shorter contig + singleKmerRandomizer(s); + break; + } + else if (getRightMarker(s.getLong(1)) < 0 && getLeftMarker(tmpReflexivKmerExtendList.get(i).getLong(1))< 0) { + reflexivExtend(tmpReflexivKmerExtendList.get(i), s, -1); + tmpReflexivKmerExtendList.remove(i); /* already extended */ + break; + } else if (getRightMarker(s.getLong(1)) >= 0 && getLeftMarker(tmpReflexivKmerExtendList.get(i).getLong(1)) >= 0) { + reflexivExtend(tmpReflexivKmerExtendList.get(i), s, -1); + tmpReflexivKmerExtendList.remove(i); /* already extended */ + break; + } else if (getRightMarker(s.getLong(1))>= 0 && getRightMarker(s.getLong(1))- tmpBlockSize -extraLength>= 0) { + reflexivExtend(tmpReflexivKmerExtendList.get(i), s, getRightMarker(s.getLong(1))- tmpBlockSize); + tmpReflexivKmerExtendList.remove(i); /* already extended */ + break; + } else if (getLeftMarker(tmpReflexivKmerExtendList.get(i).getLong(1))>= 0 && getLeftMarker(tmpReflexivKmerExtendList.get(i).getLong(1)) - currentBlockSize >= 0) { + reflexivExtend(tmpReflexivKmerExtendList.get(i), s, getLeftMarker(tmpReflexivKmerExtendList.get(i).getLong(1)) - currentBlockSize); + tmpReflexivKmerExtendList.remove(i); /* already extended */ + break; + } else { + singleKmerRandomizer(s); + break; + } + } + } + /* return reflexivKmerConcatList.iterator(); */ + } + + //else if (dynamicSubKmerComparator(s.getSeq(0), tmpReflexivKmerExtendList.get(i).getSeq(0))){ + + // } + + /* new Sub-kmer group section */ + else { /* s.getLong(0) != tmpReflexivKmerExtendList.get(i).getLong(0)()*/ + // if (lineMarker == 2) { // lineMarker == 2 represents the second line of the partition + // singleKmerRandomizer(tmpReflexivKmerExtendList.get(i)); + // } + // singleKmerRandomizer(s); + tmpKmerRandomizer(); + resetSubKmerGroup(s); + break; + } + } /* end of the while loop */ + }// end of else condition + + lineMarker++; + // return reflexivKmerConcatList.iterator(); + } + } // while loop + tmpKmerRandomizer(); + return reflexivKmerConcatList.iterator(); + } + + /** + * + * @param currentSubKmer + */ + public void singleKmerRandomizer(Row currentSubKmer) throws Exception { + long[] currentSubKmerArray = seq2array(currentSubKmer.getSeq(0)); + long[] currentReflexivArray = seq2array(currentSubKmer.getSeq(2)); + + if (getReflexivMarker(currentSubKmer.getLong(1)) == 1) { + /** + * 00000000000000110010111010010 Long.SIZE + * --------------C-G-G-G-T-C-A-G Long.SIZE - (Long.numberOfLeadingZeros / 2 + 1) + * --------------^-Length marker + */ + int currentSuffixLength = currentKmerSizeFromBinaryBlockArray(currentReflexivArray); // Long.SIZE / 2 - (Long.numberOfTrailingZeros(currentSubKmer.getLong(2)) / 2 + 1); // xx01------- + long[] newReflexivSubKmer; //= new long[param.subKmerBinarySlots]; + // long newReflexivLong; + + if (randomReflexivMarker == 2) { + + long[] combinedKmerArray = combineTwoLongBlocks(currentSubKmerArray, currentReflexivArray); + + newReflexivSubKmer = leftShiftArray(combinedKmerArray, currentSuffixLength); + + long[] newReflexivLongArray = leftShiftOutFromArray(combinedKmerArray, currentSuffixLength); + + long attribute = onlyChangeReflexivMarker(currentSubKmer.getLong(1), randomReflexivMarker); + + reflexivKmerConcatList.add( + RowFactory.create(newReflexivSubKmer, attribute, newReflexivLongArray) + ); + + } else { + reflexivKmerConcatList.add(currentSubKmer); + } + } else { /* currentSubKmer._2._1() == 2 */ + long[] newReflexivSubKmer; // = new long[param.subKmerBinarySlots]; + + int currentSubKmerSize= currentKmerSizeFromBinaryBlockArray(currentSubKmerArray); + + if (randomReflexivMarker == 2) { + reflexivKmerConcatList.add(currentSubKmer); + } else { /* randomReflexivMarker == 1 */ + + long[] combinedKmerArray = combineTwoLongBlocks(currentReflexivArray, currentSubKmerArray); + + newReflexivSubKmer= leftShiftOutFromArray(combinedKmerArray, currentSubKmerSize); + long[] newReflexivLongArray= leftShiftArray(combinedKmerArray, currentSubKmerSize); + + long attribute = onlyChangeReflexivMarker(currentSubKmer.getLong(1), randomReflexivMarker); + + + reflexivKmerConcatList.add( + RowFactory.create(newReflexivSubKmer, attribute, newReflexivLongArray) + ); + } + + } + + /* an action of randomization */ + + if (randomReflexivMarker == 1) { + randomReflexivMarker = 2; + } else { /* randomReflexivMarker == 2 */ + randomReflexivMarker = 1; + } + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0L) { + nucleotide = 'A'; + } else if (twoBits == 1L) { + nucleotide = 'C'; + } else if (twoBits == 2L) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; + } + + private boolean dynamicSubKmerComparator(Seq a, Seq b) throws Exception { + long[] arrayA = seq2array(a); + long[] arrayB = seq2array(b); + + int aLength= currentKmerSizeFromBinaryBlockArray(arrayA); + int bLength= currentKmerSizeFromBinaryBlockArray(arrayB); + + // String arrayAString = BinaryBlocksToString(arrayA); + // String arrayBString = BinaryBlocksToString(arrayB); + + // System.out.println("different comparator: " + arrayAString + " B: " + arrayBString); + + if (aLength>=bLength){ // equal should not happen + long[] shorterVersion = leftShiftOutFromArray(arrayA, bLength); + if (Arrays.equals(shorterVersion, arrayB)){ + return true; + }else{ + return false; + } + }else{ + long[] shorterVersion = leftShiftOutFromArray(arrayB, aLength); + if (Arrays.equals(shorterVersion, arrayA)){ + return true; + }else{ + return false; + } + } + } + + /** + * + * @param currentSubKmer + */ + public void directKmerComparison(Row currentSubKmer) { + tmpReflexivKmerExtendList.add(currentSubKmer); + } + + /** + * + * @param forwardSubKmer + * @param reflexedSubKmer + */ + + public void reflexivExtend(Row forwardSubKmer, Row reflexedSubKmer, int bubbleDistance) throws Exception { + + /* forward ATCGATCG, 1, ------ */ + /* reflexed ------, 2, ATCGATCG */ + + int forwardSuffixLength = currentKmerSizeFromBinaryBlockArray(seq2array(forwardSubKmer.getSeq(2))); + int forwardSubKmerLength = currentKmerSizeFromBinaryBlockArray(seq2array(forwardSubKmer.getSeq(0))); + + int reflexedPrefixLength = currentKmerSizeFromBinaryBlockArray(seq2array(reflexedSubKmer.getSeq(2))); + int reflexedSubKmerLength = currentKmerSizeFromBinaryBlockArray(seq2array(reflexedSubKmer.getSeq(0))); + + + int newSubKmerLength; + long[] longerSubKmer; + + int extraLength=0; + if (forwardSubKmerLength>reflexedSubKmerLength){ + extraLength=forwardSubKmerLength-reflexedSubKmerLength; + } + + if (forwardSubKmerLength >= reflexedSubKmerLength){ // In reality, it is always forwardSubKmer longer than or equal to reflexedSubKmer + newSubKmerLength=forwardSubKmerLength; + longerSubKmer=seq2array(forwardSubKmer.getSeq(0)); + }else{ + newSubKmerLength=reflexedSubKmerLength; + longerSubKmer=seq2array(reflexedSubKmer.getSeq(0)); + } + + long[] reflexedPrefixArray = seq2array(reflexedSubKmer.getSeq(2)); + long[] forwardSuffixArray = seq2array(forwardSubKmer.getSeq(2)); + long attribute = 0; + + + if (randomReflexivMarker == 2) { + + long[] newReflexivSubKmer = combineTwoLongBlocks(longerSubKmer, forwardSuffixArray); // xxxxx xxxxx xxx-- + xxx--- = xxxxx xxxxx xxxxx x---- + long[] newReflexivLongArray= leftShiftOutFromArray(newReflexivSubKmer, forwardSuffixLength); // xxx-- | ---xx xxxxx xxxxx x---- + + newReflexivSubKmer = leftShiftArray(newReflexivSubKmer, forwardSuffixLength); // xxxxx xxxxx xxx--- + newReflexivLongArray = combineTwoLongBlocks(reflexedPrefixArray, newReflexivLongArray); // xx--- + xxx-- + + if (bubbleDistance < 0) { + + int left=0; + int right=0; + if (getLeftMarker(reflexedSubKmer.getLong(1))>=0){ + left = getLeftMarker(reflexedSubKmer.getLong(1)); + }else{ + left= getLeftMarker(forwardSubKmer.getLong(1))-reflexedPrefixLength; + } + + if (getRightMarker(forwardSubKmer.getLong(1))>=0){ + right = getRightMarker(forwardSubKmer.getLong(1)); + }else { + right = getRightMarker(reflexedSubKmer.getLong(1))-forwardSuffixLength-extraLength; + } + + attribute = buildingAlongFromThreeInt(randomReflexivMarker, left, right); + reflexivKmerConcatList.add( + RowFactory.create(newReflexivSubKmer, + attribute, newReflexivLongArray + ) + ); + } else { + if (getLeftMarker(forwardSubKmer.getLong(1)) > 0) { + if (getRightMarker(forwardSubKmer.getLong(1)) >=0) { + attribute = buildingAlongFromThreeInt(randomReflexivMarker, bubbleDistance, getRightMarker(forwardSubKmer.getLong(1))); + }else{ + attribute= buildingAlongFromThreeInt(randomReflexivMarker, bubbleDistance, getRightMarker(reflexedSubKmer.getLong(1))-forwardSuffixLength-extraLength); + } + reflexivKmerConcatList.add( + RowFactory.create(newReflexivSubKmer, + attribute, newReflexivLongArray + ) + ); + } else { // reflexedSubKmer right >0 + if (getLeftMarker(reflexedSubKmer.getLong(1))>=0) { + attribute = buildingAlongFromThreeInt(randomReflexivMarker, getLeftMarker(reflexedSubKmer.getLong(1)), bubbleDistance-extraLength); + }else{ + attribute = buildingAlongFromThreeInt(randomReflexivMarker, getLeftMarker(forwardSubKmer.getLong(1))-reflexedPrefixLength, bubbleDistance-extraLength); + } + reflexivKmerConcatList.add( + RowFactory.create(newReflexivSubKmer, + attribute, newReflexivLongArray + ) + ); + } + } + + // String newReflexivSubKmerString = BinaryBlocksToString(newReflexivSubKmer); + // String newReflexivLongArrayString = BinaryBlocksToString(newReflexivLongArray); + + // System.out.println("Prefix " + newReflexivLongArrayString + " combined: " + newReflexivSubKmerString + " reflexivMarker: " + getReflexivMarker(attribute) + " leftMarker: " + getLeftMarker(attribute) + " rightMarker: " + getRightMarker(attribute)); + + + randomReflexivMarker = 1; /* an action of randomization */ + } else { /* randomReflexivMarker == 1 */ + + long[] newForwardSubKmer = combineTwoLongBlocks(reflexedPrefixArray, longerSubKmer); // xx--- + xxxxx xxxxx xx--- = xxxxx xxxxx xxxx- + long[] newForwardLongArray = leftShiftArray(newForwardSubKmer, newSubKmerLength); // xxxxx xxxxx xxxx- -> xx-- + + newForwardSubKmer = leftShiftOutFromArray(newForwardSubKmer, newSubKmerLength); // xxxxx xxxxx xxxx- -> xxxxx xxxxx xx---|xx- + newForwardLongArray = combineTwoLongBlocks(newForwardLongArray, forwardSuffixArray); // xx-- + xxx-- -> xxxxx + + if (bubbleDistance < 0) { + int left=0; + int right=0; + if (getLeftMarker(reflexedSubKmer.getLong(1))>=0){ + left = getLeftMarker(reflexedSubKmer.getLong(1)); + }else{ + left= getLeftMarker(forwardSubKmer.getLong(1))-reflexedPrefixLength; + } + + if (getRightMarker(forwardSubKmer.getLong(1))>=0){ + right = getRightMarker(forwardSubKmer.getLong(1)); + }else { + right = getRightMarker(reflexedSubKmer.getLong(1))-forwardSuffixLength-extraLength; + } + + attribute = buildingAlongFromThreeInt(randomReflexivMarker, left, right); + reflexivKmerConcatList.add( + RowFactory.create(newForwardSubKmer, + attribute, newForwardLongArray + ) + ); + } else { + + if (getLeftMarker(forwardSubKmer.getLong(1)) > 0) { + if (getRightMarker(forwardSubKmer.getLong(1)) >=0) { + attribute = buildingAlongFromThreeInt(randomReflexivMarker, bubbleDistance, getRightMarker(forwardSubKmer.getLong(1))); + }else{ + attribute= buildingAlongFromThreeInt(randomReflexivMarker, bubbleDistance, getRightMarker(reflexedSubKmer.getLong(1))-forwardSuffixLength-extraLength); + } + reflexivKmerConcatList.add( + RowFactory.create(newForwardSubKmer, + attribute, newForwardLongArray + ) + ); + } else { // reflexedSubKmer.getInt(4) >0 + if (getLeftMarker(reflexedSubKmer.getLong(1))>=0) { + attribute = buildingAlongFromThreeInt(randomReflexivMarker, getLeftMarker(reflexedSubKmer.getLong(1)), bubbleDistance-extraLength); + }else{ + attribute = buildingAlongFromThreeInt(randomReflexivMarker, getLeftMarker(forwardSubKmer.getLong(1))-reflexedPrefixLength, bubbleDistance-extraLength); + } + reflexivKmerConcatList.add( + RowFactory.create(newForwardSubKmer, + attribute, newForwardLongArray + ) + ); + } + } + + // String newForwardSubKmerString = BinaryBlocksToString(newForwardSubKmer); + // String newForwardLongArrayString = BinaryBlocksToString(newForwardLongArray); + + // System.out.println("After combine: " + newForwardSubKmerString + " suffix: " + newForwardLongArrayString + " reflexivMarker: " + getReflexivMarker(attribute) + " leftMarker: " + getLeftMarker(attribute) + " rightMarker: " + getRightMarker(attribute)); + + randomReflexivMarker = 2; + } + + /* add current sub kmer to temporal storage */ + // tmpReflexivKmerExtendList.add(reflexedSubKmer); + } + + /** + * + * @param S + */ + public void resetSubKmerGroup(Row S) { + if (lineMarker == 1) { + lineMarker = 2; + } else { + lineMarker = 3; /* reset to new sub-kmer group */ + } + /* re-reflex all single kmers in the sub-kmer group */ +// if (tmpReflexivKmerExtendList.size() != 0) { +// for (int i = 0; i < tmpReflexivKmerExtendList.size(); i++) { + // singleKmerRandomizer(tmpReflexivKmerExtendList.get(i)); + // } + // } + + tmpReflexivKmerExtendList = new ArrayList(); + tmpReflexivKmerExtendList.add(S + // RowFactory.create(S.getLong(0), + // S.getInt(1), S.get(2), S.getInt(3), S.getInt(4) + // ) + ); + } + + private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exception { + int startingBlockIndex = (shiftingLength)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + int residueLength = Long.SIZE / 2 - (Long.numberOfTrailingZeros(blocks[blocks.length-1])/2+1); // last block length + + int remainLength=nucleotideLength-shiftingLength-1; + if (remainLength <0){ + remainLength=0; + } + long[] newBlock = new long[remainLength/31+1]; + int relativeShiftSize = shiftingLength % 31; + + if (shiftingLength >= nucleotideLength){ + // apparantly, it is possible. meaning the block has nothing left + // throw new Exception("shifting length longer than the kmer length"); + newBlock[0]|=(1L<<2*31); //add c marker at the end + return newBlock; + } + + // if (relativeShiftSize ==0) then only shifting blocks + + int j=0; // new index for shifted blocks + // long oldShiftOut=0L; // if only one block, then 0 bits +// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex +// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); + // } + for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted + newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- + newBlock[j] |= shiftOut; + newBlock[j] &= (~0L<<2); // remove the last two bits, in case of overlength xxxxxxxxxxx - > xxxxxxxxxxx- C marker will be added later if necessary + + j++; + } + + if (residueLength > relativeShiftSize){ // still some nucleotide left in the last block + newBlock[j]= blocks[blocks.length-1] << 2*relativeShiftSize; + }else if (residueLength == relativeShiftSize){ // nothing left in the last block, but the new last block needs a C marker in the end + newBlock[j-1] |= 1L; // j-1 == newBlock.length-1 + } // else the last block has been completely shift into the new last block, including the C marker + + return newBlock; + + } + + private long[] leftShiftOutFromArray(long[] blocks, int shiftingLength) throws Exception{ + int relativeShiftSize = shiftingLength % 31; + int endingBlockIndex = (shiftingLength-1)/31; + int nucleotideLength = currentKmerSizeFromBinaryBlockArray(blocks); + long[] shiftOutBlocks = new long[endingBlockIndex+1]; + + if (shiftingLength > nucleotideLength){ + // throw new Exception("shifting length longer than the kmer length"); + return blocks; + } + + for (int i=0; i 0) { + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex] & (~0L << 2 * (32 - relativeShiftSize)); // 1111111100000000000 + shiftOutBlocks[endingBlockIndex] |= (1L << (2 * (32 - relativeShiftSize - 1))); + }else{ // relativeShiftSize == 0; + if (endingBlockIndex+1 == blocks.length) { // a block with C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + }else{ // endingBlockIndex < blocks.length -1 means a block without C marker + shiftOutBlocks[endingBlockIndex] = blocks[endingBlockIndex]; + shiftOutBlocks[endingBlockIndex]|=1L; // adding C marker in the end xxxxxxxxxC + } + + } + + return shiftOutBlocks; + } + + private long[] combineTwoLongBlocks(long[] leftBlocks, long[] rightBlocks) throws Exception { + int leftNucleotideLength = currentKmerSizeFromBinaryBlockArray(leftBlocks); + int leftRelativeNTLength = (leftNucleotideLength-1) % 31+1; + int leftVacancy = 31-leftRelativeNTLength; + int rightNucleotideLength = currentKmerSizeFromBinaryBlockArray(rightBlocks); + int combinedBlockSize = (leftNucleotideLength+rightNucleotideLength-1)/31+1; + long[] newBlocks= new long[combinedBlockSize]; + + if (rightNucleotideLength==0){ + return leftBlocks; + } + + if (leftNucleotideLength==0){ + return rightBlocks; + } + + if (leftVacancy ==0){ // left last block is a perfect block + for (int i =0; i>> 2*(leftRelativeNTLength)); + if (leftBlocks.length>> 2*(32-1)); // 01-------- -> ---------01 reflexiv marker + return reflexivMarker; + } + + private int getLeftMarker(long attribute){ + int leftMarker = (int) (attribute >>> 2*(16)); // 01--xxxx-----xxxx -> 01--xxxx shift out right marker + int leftMarkerBinaryBits= ~(3 << 30) ; // ---------11 -> 11---------- -> 0011111111111 + leftMarker &= leftMarkerBinaryBits; // remove reflexivMarker + + if (leftMarker>30000){ + leftMarker=30000-leftMarker; + } + + return leftMarker; + } + + private int getRightMarker(long attribute){ + int rightMarker = (int) attribute; + + if (rightMarker>30000){ + rightMarker=30000-rightMarker; + } + + return rightMarker; + } + + private long buildingAlongFromThreeInt(int ReflexivMarker, int leftCover, int rightCover){ + long info = (long) ReflexivMarker <<2*(32-1); //move to the left most + + /** + * shorten the int and change negative to positive to avoid two's complementary + */ + if (leftCover>=30000){ + leftCover=30000; + }else if (leftCover<=-30000){ + leftCover=30000-(-30000); + }else if (leftCover<0){ + leftCover=30000-leftCover; + } + + if (rightCover>=30000){ + rightCover=30000; + }else if (rightCover<=-30000){ + rightCover=30000-(-30000); + }else if (rightCover<0){ + rightCover=30000-rightCover; + } + + info |= ((long) leftCover << 32) ; // move one integer (32 bits) to the left + info |= ((long) rightCover); // 01--LeftCover---RightCover + + return info; + } + + private long[] seq2array(Seq a){ + long[] array =new long[a.length()]; + for (int i = 0; i < a.length(); i++) { + array[i] = (Long) a.apply(i); + } + return array; + } + + private String BinaryBlocksToString (long[] binaryBlocks){ + String KmerString=""; + int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + KmerString += currentNucleotide; + } + + return KmerString; + } + + /** + * + */ + public void tmpKmerRandomizer() throws Exception { + if (tmpReflexivKmerExtendList.size() != 0) { + for (int i = 0; i < tmpReflexivKmerExtendList.size(); i++) { + singleKmerRandomizer(tmpReflexivKmerExtendList.get(i)); + } + } + } + + private boolean subKmerSlotComparator(Seq a, Seq b) { + long[] arrayA = seq2array(a); + long[] arrayB = seq2array(b); + + int aLength= currentKmerSizeFromBinaryBlockArray(arrayA); + int bLength= currentKmerSizeFromBinaryBlockArray(arrayB); + + // String arrayAString = BinaryBlocksToString(arrayA); + // String arrayBString = BinaryBlocksToString(arrayB); + if (aLength==bLength){ + // System.out.println("equal comparator: " + arrayAString + " B: " + arrayBString); + + } + + if (a.length() != b.length()){ + return false; + } + + for (int i = 0; i < a.length(); i++) { + if (!a.apply(i).equals(b.apply(i))) { + return false; + } + } + + return true; + } + } + + class FastqTuple2Dataset implements MapPartitionsFunction, Row>, Serializable { + + + List kmerList = new ArrayList(); + int SeedKmerSize=15; + int readLength; + Long ID; + char nucleotide; + long nucleotideInt; + + int currentExtensionSize; + int currentExtensionBlockSize; + String extension; + + + public Iterator call(Iterator> s) { + + while (s.hasNext()) { + Tuple2 sTuple = s.next(); + ID= sTuple._2; + extension = sTuple._1; + readLength = extension.length(); + + if (readLength - SeedKmerSize - param.endClip <= 1 || param.frontClip > readLength) { + continue; + } + + currentExtensionSize = extension.length(); + currentExtensionBlockSize = (currentExtensionSize-1)/31+1; + + // if (!kmerSizeCheck(kmer, param.kmerListHash)){continue;} // the kmer length does not fit into any of the kmers in the list. + + + long[] extensionBinarySlot = new long[currentExtensionBlockSize]; + + for (int i = 0; i < currentExtensionSize; i++) { + nucleotide = extension.charAt(i); + if (nucleotide >= 256) nucleotide = 255; + nucleotideInt = nucleotideValue(nucleotide); + // forward kmer in bits + nucleotideInt <<= 2*(32-1-(i%31)); // shift to the left [ATCGGATCC-,ATCGGATCC-] + + extensionBinarySlot[i / 31] |= nucleotideInt; + } + + long kmerEndMark = 1L; + + kmerEndMark <<= 2*(32-1-((currentExtensionSize-1)%31+1)); + extensionBinarySlot[currentExtensionBlockSize-1] |= kmerEndMark; // param.kmerListHash.get(currentKmerSize)] == currentKmerBlockSize + + // attribute= onlyChangeReflexivMarker(attribute,1); + kmerList.add( + RowFactory.create(ID, extensionBinarySlot) + ); + } + return kmerList.iterator(); + } + + private long nucleotideValue(char a) { + long value; + if (a == 'A') { + value = 0L; + } else if (a == 'C') { + value = 1L; + } else if (a == 'G') { + value = 2L; + } else { // T + value = 3L; + } + return value; + } + } + + class ReverseComplementKmerBinaryExtractionFromDataset implements MapPartitionsFunction, Row>, Serializable { + + + List kmerList = new ArrayList(); + int SeedKmerSize=15; + long maxKmerBits = ~((~0L) << (2 * SeedKmerSize)); + int readLength; + String[] units; + String read; + Long ID; + char nucleotide; + long nucleotideInt; + long nucleotideIntComplement; + + long forwardSeed; + long reverseSeed; + + public Iterator call(Iterator> s) { + + while (s.hasNext()) { + Tuple2 sTuple = s.next(); + ID = sTuple._2; + read = sTuple._1; + readLength = read.length(); + + + // System.out.println(read); + + if (readLength - param.kmerSize - param.endClip + 1 <= 0 || param.frontClip > readLength) { + continue; + } + + Long nucleotideBinary = 0L; + Long nucleotideBinaryReverseComplement = 0L; + long[] nucleotideBinarySlot = new long[param.kmerBinarySlots]; + long[] nucleotideBinaryReverseComplementSlot = new long[param.kmerBinarySlots]; + + for (int i = param.frontClip; i < readLength - param.endClip; i++) { + nucleotide = read.charAt(i); + if (nucleotide >= 256) nucleotide = 255; + nucleotideInt = nucleotideValue(nucleotide); + + // forward kmer in bits + if (i - param.frontClip <= param.kmerSize - 1) { + nucleotideBinary <<= 2; + nucleotideBinary |= nucleotideInt; + + if ((i - param.frontClip + 1) % 32 == 0) { // each 32 nucleotides fill a slot + nucleotideBinarySlot[(i - param.frontClip + 1) / 32 - 1] = nucleotideBinary; + nucleotideBinary = 0L; + } + + if (i - param.frontClip == param.kmerSize - 1) { // start completing the first kmer + nucleotideBinary &= maxKmerBits; + nucleotideBinarySlot[(i - param.frontClip + 1) / 32] = nucleotideBinary; // (i-param.frontClip+1)/32 == nucleotideBinarySlot.length -1 + nucleotideBinary = 0L; + + // reverse complement + + } + } else { + // the last block, which is shorter than 32 mer + Long transitBit1 = nucleotideBinarySlot[param.kmerBinarySlots - 1] >>> 2 * (param.kmerSizeResidue - 1); // 0000**---------- -> 000000000000** + // for the next block + Long transitBit2; // for the next block + + // update the last block of kmer binary array + nucleotideBinarySlot[param.kmerBinarySlots - 1] <<= 2; // 0000------------- -> 00------------00 + nucleotideBinarySlot[param.kmerBinarySlots - 1] |= nucleotideInt; // 00------------00 -> 00------------** + nucleotideBinarySlot[param.kmerBinarySlots - 1] &= maxKmerBits; // 00------------** -> 0000----------** + + // the rest + for (int j = param.kmerBinarySlots - 2; j >= 0; j--) { + transitBit2 = nucleotideBinarySlot[j] >>> (2 * 31); // **--------------- -> 0000000000000** + nucleotideBinarySlot[j] <<= 2; // --------------- -> --------------00 + nucleotideBinarySlot[j] |= transitBit1; // -------------00 -> -------------** + transitBit1 = transitBit2; + } + } + + // reverse kmer binarizationalitivities :) non English native speaking people making fun of English + nucleotideIntComplement = nucleotideInt ^ 3; // 3 is binary 11; complement: 11(T) to 00(A), 10(G) to 01(C) + + if (i - param.frontClip <= param.kmerSize - 1) { + if (i - param.frontClip < param.kmerSizeResidue - 1) { + nucleotideIntComplement <<= 2 * (i - param.frontClip); // + nucleotideBinaryReverseComplement |= nucleotideIntComplement; + } else if (i - param.frontClip == param.kmerSizeResidue - 1) { + nucleotideIntComplement <<= 2 * (i - param.frontClip); + nucleotideBinaryReverseComplement |= nucleotideIntComplement; + nucleotideBinaryReverseComplementSlot[param.kmerBinarySlots - 1] = nucleotideBinaryReverseComplement; // param.kmerBinarySlot-1 = nucleotideBinaryReverseComplementSlot.length -1 + nucleotideBinaryReverseComplement = 0L; + + /** + * param.kmerSizeResidue is the last block length; + * i-param.frontClip is the index of the nucleotide on the sequence; + * +1 change index to length + */ + } else if ((i - param.frontClip - param.kmerSizeResidue + 1) % 32 == 0) { // + + nucleotideIntComplement <<= 2 * ((i - param.frontClip - param.kmerSizeResidue) % 32); // length (i- param.frontClip-param.kmerSizeResidue +1) -1 shift + nucleotideBinaryReverseComplement |= nucleotideIntComplement; + + // filling the blocks in a reversed order + nucleotideBinaryReverseComplementSlot[param.kmerBinarySlots - ((i - param.frontClip - param.kmerSizeResidue + 1) / 32) - 1] = nucleotideBinaryReverseComplement; + nucleotideBinaryReverseComplement = 0L; + } else { + nucleotideIntComplement <<= 2 * ((i - param.frontClip - param.kmerSizeResidue) % 32); // length (i- param.frontClip-param.kmerSizeResidue +1) -1 shift + nucleotideBinaryReverseComplement |= nucleotideIntComplement; + } + } else { + // the first transition bit from the first block + long transitBit1 = nucleotideBinaryReverseComplementSlot[0] << 2 * 31; + long transitBit2; + + nucleotideBinaryReverseComplementSlot[0] >>>= 2; + nucleotideIntComplement <<= 2 * 31; + nucleotideBinaryReverseComplementSlot[0] |= nucleotideIntComplement; + + for (int j = 1; j < param.kmerBinarySlots - 1; j++) { + transitBit2 = nucleotideBinaryReverseComplementSlot[j] << 2 * 31; + nucleotideBinaryReverseComplementSlot[j] >>>= 2; + // transitBit1 <<= 2*31; + nucleotideBinaryReverseComplementSlot[j] |= transitBit1; + transitBit1 = transitBit2; + } + + nucleotideBinaryReverseComplementSlot[param.kmerBinarySlots - 1] >>>= 2; + transitBit1 >>>= 2 * (31 - param.kmerSizeResidue + 1); + nucleotideBinaryReverseComplementSlot[param.kmerBinarySlots - 1] |= transitBit1; + } + + + // reach the first complete K-mer + if (i - param.frontClip >= param.kmerSize - 1) { + + kmerList.add(RowFactory.create(nucleotideBinarySlot, ID, i)); // the number does not matter, as the count is based on units + kmerList.add(RowFactory.create(nucleotideBinaryReverseComplementSlot, -ID, readLength-param.kmerSize-i)); + } + } + } + + + return kmerList.iterator(); + } + + + private String BinaryLongToString (long binaryBlocks){ // this one has been modified for k-mer 15 + // String KmerString=""; + // int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); + int KmerLength = 15; + StringBuilder sb= new StringBuilder(); + char currentNucleotide; + + for (int i=0; i< KmerLength; i++){ + Long currentNucleotideBinary = binaryBlocks>>> 2 * (32 - (i%31+1)); + currentNucleotideBinary &= 3L; + currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); + sb.append(currentNucleotide); + } + + return sb.toString(); + } + + private char BinaryToNucleotide(Long twoBits) { + char nucleotide; + if (twoBits == 0) { + nucleotide = 'A'; + } else if (twoBits == 1) { + nucleotide = 'C'; + } else if (twoBits == 2) { + nucleotide = 'G'; + } else { + nucleotide = 'T'; + } + return nucleotide; + } + + + private long buildingAlongForCompression(long kmer, int index, int ROC){ // ROC read or contig + // xxxxxxxxxC|R----index assuming contig length smaller than 1G + + long ROCLong = (long) ROC << 2*15; + kmer|= ROCLong; + return kmer|(long) index; + } + + + private long nucleotideValue(char a) { + long value; + if (a == 'A') { + value = 0L; + } else if (a == 'C') { + value = 1L; + } else if (a == 'G') { + value = 2L; + } else { // T + value = 3L; + } + return value; + } + } + + /** + * interface class for RDD implementation, used in step 3 + * ----------- + * ------ + * ------ + * ------ + * ------ + * ------ + * ------ + */ + + + class DSFastqUnitFilter implements FilterFunction, Serializable { + public boolean call(String s) { + return s != null; + } + } + + /** + * interface class for RDD implementation, Used in step 1 + */ + + + class DSFastqFilterWithQual implements MapFunction, Serializable { + String line = ""; + int lineMark = 0; + + public String call(String s) { + if (lineMark == 2) { + lineMark++; + line = line + "\n" + s; + return null; + } else if (lineMark == 3) { + lineMark++; + line = line + "\n" + s; + return line; + } else if (s.startsWith("@")) { + line = s; + lineMark = 1; + return null; + } else if (lineMark == 1) { + line = line + "\n" + s; + lineMark++; + return null; + } else { + return null; + } + } + } + + /** + * interface class for RDD implementation, used in step 2 + */ + + + /** + * + * @param param + */ + public void setParam(DefaultParam param) { + this.param = param; + } +} diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSorting.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSorting.java index 1687311..7b4b86a 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSorting.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSorting.java @@ -116,10 +116,8 @@ public void assemblyFromKmer() { String checkpointDir= sc.getCheckpointDir().get(); Dataset KmerCountDS; - Dataset LongerKmerCountDS; Dataset KmerBinaryCountDS; - DatasetLongerKmerBinaryCountDS; StructType kmerCountTupleStruct = new StructType(); kmerCountTupleStruct = kmerCountTupleStruct.add("kmer", DataTypes.createArrayType(DataTypes.LongType), false); @@ -159,10 +157,6 @@ public void assemblyFromKmer() { ReflexivFullKmerStringStruct = ReflexivFullKmerStringStruct.add("reflection", DataTypes.StringType, false); ExpressionEncoder ReflexivFullKmerStringEncoder = RowEncoder.apply(ReflexivFullKmerStringStruct); - - - - /** * loading Kmer counts */ @@ -188,10 +182,6 @@ public void assemblyFromKmer() { ) ); - // if (param.partitions > 0) { - // LongerKmerBinaryCountDS = LongerKmerBinaryCountDS.repartition(param.partitions); - // } - if (param.cache) { KmerBinaryCountDS.cache(); @@ -199,12 +189,8 @@ public void assemblyFromKmer() { KmerBinaryCountDS = KmerBinaryCountDS.mapPartitions(DSRCKmer, KmerBinaryCountEncoder); -// KmerBinaryCountDS.show(); - ReflexivSubKmerDS = KmerBinaryCountDS.mapPartitions(DSextractForwardSubKmer, ReflexivSubKmerCompressedEncoder); -// ReflexivSubKmerDS.show(); - if (param.bubble == true) { ReflexivSubKmerDS = ReflexivSubKmerDS.sort("k-1"); if (param.minErrorCoverage == 0) { @@ -233,10 +219,6 @@ public void assemblyFromKmer() { */ ReflexivFullKmerDS= ReflexivSubKmerDS.mapPartitions(DSSubKmerToFullLengthKmer, ReflexivFullKmerEncoder); -/* - LongerKmerToEnglightenKmer LongerKmerEnlightmentPreparation = new LongerKmerToEnglightenKmer(); - ReflexivFullKmerDS =ReflexivSubKmerDS.mapPartitions(LongerKmerEnlightmentPreparation, ReflexivFullKmerEncoder); -*/ DSBinaryFullKmerArrayToString FullKmerToStringLong = new DSBinaryFullKmerArrayToString(); @@ -260,19 +242,6 @@ public void assemblyFromKmer() { } - class TagContigID implements FlatMapFunction, Long>, String>, Serializable { - - public Iterator call(Tuple2, Long> s) { - - - List contigList = new ArrayList(); - - contigList.add(s._1._1 + "-" + s._2 + "\n" + s._1._2); - - return contigList.iterator(); - } - } - class DSBinaryFullKmerArrayToString implements MapPartitionsFunction, Serializable { List reflexivKmerStringList = new ArrayList(); @@ -465,7 +434,7 @@ public Iterator call(Iterator s) { Row subKmer = s.next(); int reflexivMarker = getReflexivMarker(subKmer.getLong(1)); int leftMarker = getLeftMarker(subKmer.getLong(1)); - int rightMarker = getRightMarker(subKmer.getLong(1)); + int rightMarker = getRightMarker(subKmer.getLong(1)); // should use rightMarker here. However since in the beginning, left and right are the same as coverage, it does not matter long[] subKmerArray = seq2array(subKmer.getSeq(0)); long attribute=0; @@ -482,7 +451,7 @@ public Iterator call(Iterator s) { int highestLeftMarker = getLeftMarker(HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getLong(1)); if (subKmerSlotComparator(subKmer.getSeq(0), HighCoverageSubKmer.get(HighCoverageSubKmer.size() - 1).getSeq(0)) == true) { if (leftMarker > highestLeftMarker) { - if (highestLeftMarker <= param.minErrorCoverage && leftMarker >= param.minRepeatFold * highestLeftMarker) { + if (highestLeftMarker <= param.minErrorCoverage && leftMarker >= param.minRepeatFold * highestLeftMarker) { // should use rightMarker here . However, since in the beginning, left and right are the same as coverage, it does not matter attribute = buildingAlongFromThreeInt(reflexivMarker, leftMarker, -1); HighCoverageSubKmer.set(HighCoverageSubKmer.size() - 1, RowFactory.create(subKmer.getSeq(0), attribute, subKmer.getLong(2)) diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSortingOld.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSortingOld.java index 2b4d5d4..a2fdd48 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSortingOld.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerLeftAndRightSortingOld.java @@ -145,10 +145,7 @@ public void assemblyFromKmer() { String checkpointDir= sc.getCheckpointDir().get(); Dataset KmerCountDS; - Dataset LongerKmerCountDS; - Dataset KmerBinaryCountDS; - DatasetLongerKmerBinaryCountDS; StructType kmerCountTupleStruct = new StructType(); kmerCountTupleStruct = kmerCountTupleStruct.add("kmer", DataTypes.createArrayType(DataTypes.LongType), false); @@ -165,7 +162,7 @@ public void assemblyFromKmer() { ReflexivKmerStruct = ReflexivKmerStruct.add("right", DataTypes.IntegerType, false); ExpressionEncoder ReflexivSubKmerEncoder = RowEncoder.apply(ReflexivKmerStruct); - Dataset ReflexivSubKmerDSCompressed; + StructType ReflexivKmerStructCompressedStruct = new StructType(); ReflexivKmerStructCompressedStruct = ReflexivKmerStructCompressedStruct.add("k-1", DataTypes.createArrayType(DataTypes.LongType), false); ReflexivKmerStructCompressedStruct = ReflexivKmerStructCompressedStruct.add("reflection", DataTypes.LongType, false); @@ -173,15 +170,11 @@ public void assemblyFromKmer() { ExpressionEncoder ReflexivSubKmerCompressedEncoder= RowEncoder.apply(ReflexivKmerStructCompressedStruct); Dataset ReflexivFullKmerDS; - Dataset MixedFullKmerDS; - Dataset MixedReflexivSubkmerDS; StructType FullKmerWithAttributeStruct = new StructType(); FullKmerWithAttributeStruct = FullKmerWithAttributeStruct.add("k", DataTypes.createArrayType(DataTypes.LongType), false); FullKmerWithAttributeStruct = FullKmerWithAttributeStruct.add("reflection", DataTypes.LongType, false); ExpressionEncoder ReflexivFullKmerEncoder= RowEncoder.apply(FullKmerWithAttributeStruct); - - Dataset DSFullKmerStringShort; Dataset DSFullKmerString; StructType ReflexivFullKmerStringStruct = new StructType(); ReflexivFullKmerStringStruct = ReflexivFullKmerStringStruct.add("k", DataTypes.StringType, false); @@ -262,10 +255,6 @@ public void assemblyFromKmer() { */ ReflexivFullKmerDS= ReflexivSubKmerDS.mapPartitions(DSSubKmerToFullLengthKmer, ReflexivFullKmerEncoder); -/* - LongerKmerToEnglightenKmer LongerKmerEnlightmentPreparation = new LongerKmerToEnglightenKmer(); - ReflexivFullKmerDS =ReflexivSubKmerDS.mapPartitions(LongerKmerEnlightmentPreparation, ReflexivFullKmerEncoder); -*/ DSBinaryFullKmerArrayToStringLong FullKmerToStringLong = new DSBinaryFullKmerArrayToStringLong(); @@ -306,8 +295,6 @@ class DSBinaryFullKmerArrayToStringLong implements MapPartitionsFunction reflexivKmerStringList = new ArrayList(); public Iterator call(Iterator sIterator) { - // Timestamp timestamp = new Timestamp(System.currentTimeMillis()); - // System.out.println(timestamp + "RepeatCheck DSBinaryFullKmerArrayToStringLong: " + param.kmerSize1); while (sIterator.hasNext()) { Row s = sIterator.next(); @@ -1109,12 +1096,6 @@ private int getRightMarker(long attribute){ } - - /** - * - */ - - class DSReflectedSubKmerExtractionFromForward implements MapPartitionsFunction, Serializable { List TupleList = new ArrayList(); Long suffixBinary; @@ -1569,15 +1550,6 @@ private String BinaryBlocksToString (long[] binaryBlocks){ } } - /** - * - */ - - - /** - * interface class for RDD implementation, used in step 5 - */ - /** * interface class for RDD implementation, used in step 4 */ @@ -1780,136 +1752,6 @@ private long nucleotideValue(char a) { } - class ReverseComplementKmerBinaryExtractionFromDataset implements MapPartitionsFunction, Serializable { - long maxKmerBits = ~((~0L) << (2 * param.kmerSize)); - - List kmerList = new ArrayList(); - int readLength; - String[] units; - String read; - char nucleotide; - long nucleotideInt; - long nucleotideIntComplement; - - public Iterator call(Iterator s) { - - while (s.hasNext()) { - units = s.next().split("\\n"); - read = units[1]; - readLength = read.length(); - - if (readLength - param.kmerSize - param.endClip <= 1 || param.frontClip > readLength) { - continue; - } - - Long nucleotideBinary = 0L; - Long nucleotideBinaryReverseComplement = 0L; - - for (int i = param.frontClip; i < readLength - param.endClip; i++) { - nucleotide = read.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideBinary <<= 2; - nucleotideBinary |= nucleotideInt; - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinary &= maxKmerBits; - } - - // reverse kmer binarizationalitivities :) non English native speaking people making fun of English - nucleotideIntComplement = nucleotideInt ^ 3; // 3 is binary 11; complement: 11(T) to 00(A), 10(G) to 01(C) - - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinaryReverseComplement >>>= 2; - nucleotideIntComplement <<= 2 * (param.kmerSize - 1); - } else { - nucleotideIntComplement <<= 2 * (i - param.frontClip); - } - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - - // reach the first complete K-mer - if (i - param.frontClip >= param.kmerSize - 1) { - if (nucleotideBinary.compareTo(nucleotideBinaryReverseComplement) < 0) { - kmerList.add(nucleotideBinary); - } else { - kmerList.add(nucleotideBinaryReverseComplement); - } - } - } - } - return kmerList.iterator(); - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - } - - /** - * interface class for RDD implementation, used in step 3 - * ----------- - * ------ - * ------ - * ------ - * ------ - * ------ - * ------ - */ - - - class DSFastqUnitFilter implements FilterFunction, Serializable { - public boolean call(String s) { - return s != null; - } - } - - /** - * interface class for RDD implementation, Used in step 1 - */ - - - class DSFastqFilterWithQual implements MapFunction, Serializable { - String line = ""; - int lineMark = 0; - - public String call(String s) { - if (lineMark == 2) { - lineMark++; - line = line + "\n" + s; - return null; - } else if (lineMark == 3) { - lineMark++; - line = line + "\n" + s; - return line; - } else if (s.startsWith("@")) { - line = s; - lineMark = 1; - return null; - } else if (lineMark == 1) { - line = line + "\n" + s; - lineMark++; - return null; - } else { - return null; - } - } - } - - /** - * interface class for RDD implementation, used in step 2 - */ - - /** * * @param param diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerProcessing64.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerProcessing64.java index 1e17931..d8c0ceb 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerProcessing64.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSKmerProcessing64.java @@ -217,11 +217,6 @@ public void assemblyFromKmer() { KmerCountDS = spark.read().csv(param.inputKmerPath + "_" + param.kmerSize1); LongerKmerCountDS = spark.read().csv(param.inputKmerPath + "_" + param.kmerSize2); - /* - if (param.partitions > 0) { - KmerCountDS = KmerCountDS.repartition(param.partitions); - } -*/ /** * Transforming kmer string to binary kmer */ @@ -294,8 +289,6 @@ public void assemblyFromKmer() { ReflexivSubKmerDS = ReflexivSubKmerDS.sort("k-1"); DSBinaryReflexivKmerToString StringOutputDS = new DSBinaryReflexivKmerToString(); - // Dataset ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, reflexivKmerStringEncoder); - //ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + 1); DSExtendReflexivKmer DSKmerExtention = new DSExtendReflexivKmer(); ReflexivSubKmerDS = ReflexivSubKmerDS.mapPartitions(DSKmerExtention, ReflexivSubKmerEncoder); @@ -313,10 +306,6 @@ public void assemblyFromKmer() { iterations++; - //ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, ReflexivKmerStringEncoder); - // ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); - //ReflexivSubKmerStringDS.write().format("csv").save(param.outputPath + iterations); - /** * Extract Long sub kmer */ @@ -431,18 +420,8 @@ public void assemblyFromKmer() { System.out.println("mark iteration: " + iterations + " has kmers: " + IterationCount); ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.sort("k-1"); -// ReflexivLongSubKmerDS.cache(); -// ReflexivLongSubKmerStringDS = ReflexivLongSubKmerDS.mapPartitions(DSArrayStringOutput, ReflexivLongKmerStringEncoder); -// ReflexivLongSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); -// ReflexivSubKmerStringDS= ReflexivLongSubKmerDS.mapPartitions(StringOutputDS, reflexivKmerStringEncoder); -// ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); -// ReflexivSubKmerStringRDD = ReflexivLongSubKmerRDD.mapPartitionsToPair(ArrayStringOutput); -// ReflexivSubKmerStringRDD.saveAsTextFile(param.outputPath + iterations); - ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.mapPartitions(DSKmerExtenstionArrayToArray, ReflexivLongKmerEncoder); -// ReflexivSubKmerStringRDD = ReflexivLongSubKmerRDD.mapPartitionsToPair(ArrayStringOutput); -// ReflexivSubKmerStringRDD.saveAsTextFile(param.outputPath + iterations + "Extend"); } } @@ -573,18 +552,6 @@ public void assemblyFromKmer() { ReflexivLongSubKmerStringDS = ReflexivLongSubKmerDS.mapPartitions(DSArrayStringOutput, ReflexivLongKmerStringEncoder); - /** - * - */ - // DSKmerToContigLength contigLengthDS = new DSKmerToContigLength(); - // ContigLengthRows = ReflexivLongSubKmerStringDS.mapPartitions(contigLengthDS, ContigLengthEncoder); - - - // DSFormatContigs ContigFormater = new DSFormatContigs(); - // ContigRows= ContigMergedRow.mapPartitions(ContigFormater, ContigStringEncoder); - - - DSKmerToContig contigformaterDS = new DSKmerToContig(); ContigRows = ReflexivLongSubKmerStringDS.mapPartitions(contigformaterDS, ContigStringEncoder); @@ -629,19 +596,6 @@ public Iterator call(Tuple2 s) { } } - class TagContigID implements FlatMapFunction, Long>, String>, Serializable { - - public Iterator call(Tuple2, Long> s) { - - - List contigList = new ArrayList(); - - contigList.add(s._1._1 + "-" + s._2 + "\n" + s._1._2); - - return contigList.iterator(); - } - } - class DSKmerToContig implements MapPartitionsFunction, Serializable { public Iterator call(Iterator sIterator) { @@ -10493,19 +10447,6 @@ public void singleKmerRandomizer(Row currentSubKmer) { } } - /** - * - */ - - - /** - * interface class for RDD implementation, used in step 5 - */ - - /** - * interface class for RDD implementation, used in step 4 - */ - class DSKmerReverseComplement implements MapPartitionsFunction, Serializable { /* a capsule for all Kmers and reverseComplementKmers */ @@ -10573,38 +10514,6 @@ private char BinaryToNucleotide(Long twoBits) { } } - class DSKmerReverseComplementLong implements MapPartitionsFunction, Serializable { - /* a capsule for all Kmers and reverseComplementKmers */ - List kmerList = new ArrayList(); - Long reverseComplement; - Row kmerTuple; - Long lastTwoBits; - Long kmerBinary; - - - public Iterator call(Iterator s) { - - - while (s.hasNext()) { - kmerTuple = s.next(); - kmerBinary = kmerTuple.getLong(0); - reverseComplement = 0L; - for (int i = 0; i < param.kmerSize; i++) { - reverseComplement <<= 2; - - lastTwoBits = kmerBinary & 3L ^ 3L; - kmerBinary >>>= 2; - reverseComplement |= lastTwoBits; - } - - kmerList.add(RowFactory.create(kmerTuple.getLong(0), (int) kmerTuple.getLong(1))); - kmerList.add(RowFactory.create(reverseComplement, (int) kmerTuple.getLong(1))); - } - - return kmerList.iterator(); - } - } - class KmerBinarizer implements MapPartitionsFunction, Serializable { List kmerList = new ArrayList(); @@ -10687,402 +10596,6 @@ private long nucleotideValue(char a) { } - class KmerLongerBinarizer implements MapPartitionsFunction, Serializable { - - List kmerList = new ArrayList(); - Row units; - String kmer; - int cover; - char nucleotide; - long nucleotideInt; - // Long suffixBinary; - // Long[] suffixBinaryArray; - - public Iterator call(Iterator s) { - - while (s.hasNext()) { - - units = s.next(); - - kmer = units.getString(0); - - if (kmer.startsWith("(")) { - kmer = kmer.substring(1); - } - - if (units.getString(1).endsWith(")")) { - if (units.getString(1).length() >= 11) { - cover = 1000000000; - } else { - cover = Integer.parseInt(StringUtils.chop(units.getString(1))); - } - } else { - if (units.getString(1).length() >= 10) { - cover = 1000000000; - } else { - cover = Integer.parseInt(units.getString(1)); - } - } - - long[] nucleotideBinarySlot = new long[param.kmerBinarySlotsAssemble]; - // Long nucleotideBinary = 0L; - - for (int i = 0; i < param.kmerSize1; i++) { - nucleotide = kmer.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideBinarySlot[i / 31] <<= 2; - nucleotideBinarySlot[i / 31] |= nucleotideInt; - - // nucleotideBinary <<= 2; - // nucleotideBinary |= nucleotideInt; - } - - kmerList.add( - RowFactory.create(nucleotideBinarySlot, cover) - ); - - // kmerList.add( - // new Tuple2( - // nucleotideBinary, cover - // ) - //); - } - - return kmerList.iterator(); - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - - } - - class ReverseComplementKmerBinaryExtractionFromDataset64 implements MapPartitionsFunction, Serializable{ - long maxKmerBits= ~((~0L) << (2*param.kmerSizeResidue)); - - List kmerList = new ArrayList(); - int readLength; - String[] units; - String read; - char nucleotide; - long nucleotideInt; - long nucleotideIntComplement; - - public Iterator call(Iterator s){ - - while (s.hasNext()) { - units = s.next().split("\\n"); - read = units[1]; - readLength = read.length(); - - - if (readLength - param.kmerSize - param.endClip +1 <= 0 || param.frontClip > readLength) { - continue; - } - - Long nucleotideBinary = 0L; - Long nucleotideBinaryReverseComplement = 0L; - long[] nucleotideBinarySlot = new long[param.kmerBinarySlots]; - long[] nucleotideBinaryReverseComplementSlot = new long[param.kmerBinarySlots]; - - for (int i = param.frontClip; i < readLength - param.endClip; i++) { - nucleotide = read.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - - // forward kmer in bits - if (i - param.frontClip <= param.kmerSize-1) { - nucleotideBinary <<= 2; - nucleotideBinary |= nucleotideInt; - - if ((i - param.frontClip+1) % 32 == 0) { // each 32 nucleotides fill a slot - nucleotideBinarySlot[(i - param.frontClip+1) / 32 - 1] = nucleotideBinary; - nucleotideBinary = 0L; - } - - if (i - param.frontClip == param.kmerSize-1) { // start completing the first kmer - nucleotideBinary &= maxKmerBits; - nucleotideBinarySlot[(i - param.frontClip+1) / 32] = nucleotideBinary; // (i-param.frontClip+1)/32 == nucleotideBinarySlot.length -1 - nucleotideBinary = 0L; - - // reverse complement - - } - }else{ - // the last block, which is shorter than 32 mer - Long transitBit1 = nucleotideBinarySlot[param.kmerBinarySlots-1] >>> 2*(param.kmerSizeResidue-1) ; // 0000**---------- -> 000000000000** - // for the next block - Long transitBit2; // for the next block - - // update the last block of kmer binary array - nucleotideBinarySlot[param.kmerBinarySlots-1] <<= 2; // 0000------------- -> 00------------00 - nucleotideBinarySlot[param.kmerBinarySlots-1] |= nucleotideInt; // 00------------00 -> 00------------** - nucleotideBinarySlot[param.kmerBinarySlots-1] &= maxKmerBits; // 00------------** -> 0000----------** - - // the rest - for (int j = param.kmerBinarySlots-2; j >=0; j--) { - transitBit2 = nucleotideBinarySlot[j] >>> (2*31); // **--------------- -> 0000000000000** - nucleotideBinarySlot[j] <<=2; // --------------- -> --------------00 - nucleotideBinarySlot[j] |= transitBit1; // -------------00 -> -------------** - transitBit1= transitBit2; - } - } - - // reverse kmer binarizationalitivities :) non English native speaking people making fun of English - nucleotideIntComplement = nucleotideInt ^ 3; // 3 is binary 11; complement: 11(T) to 00(A), 10(G) to 01(C) - - if (i - param.frontClip <= param.kmerSize -1){ - if (i-param.frontClip < param.kmerSizeResidue-1){ - nucleotideIntComplement <<=2 * (i-param.frontClip); // - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - }else if (i-param.frontClip == param.kmerSizeResidue-1){ - nucleotideIntComplement <<=2 * (i-param.frontClip); - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - nucleotideBinaryReverseComplementSlot[param.kmerBinarySlots-1] = nucleotideBinaryReverseComplement; // param.kmerBinarySlot-1 = nucleotideBinaryReverseComplementSlot.length -1 - nucleotideBinaryReverseComplement =0L; - - /** - * param.kmerSizeResidue is the last block length; - * i-param.frontClip is the index of the nucleotide on the sequence; - * +1 change index to length - */ - }else if ((i- param.frontClip-param.kmerSizeResidue +1) % 32 ==0){ // - - nucleotideIntComplement <<= 2 * ((i - param.frontClip-param.kmerSizeResidue) % 32); // length (i- param.frontClip-param.kmerSizeResidue +1) -1 shift - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - - // filling the blocks in a reversed order - nucleotideBinaryReverseComplementSlot[param.kmerBinarySlots - ((i- param.frontClip-param.kmerSizeResidue +1)/32) -1]= nucleotideBinaryReverseComplement; - nucleotideBinaryReverseComplement=0L; - } else{ - nucleotideIntComplement <<= 2 * ((i - param.frontClip-param.kmerSizeResidue) % 32); // length (i- param.frontClip-param.kmerSizeResidue +1) -1 shift - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - } - }else { - // the first transition bit from the first block - long transitBit1 = nucleotideBinaryReverseComplementSlot[0] << 2*31; - long transitBit2; - - nucleotideBinaryReverseComplementSlot[0] >>>= 2; - nucleotideIntComplement <<= 2*31; - nucleotideBinaryReverseComplementSlot[0] |= nucleotideIntComplement; - - for (int j=1; j>>= 2; - // transitBit1 <<= 2*31; - nucleotideBinaryReverseComplementSlot[j] |= transitBit1; - transitBit1 = transitBit2; - } - - nucleotideBinaryReverseComplementSlot[param.kmerBinarySlots-1] >>>= 2; - transitBit1 >>>= 2*(31-param.kmerSizeResidue+1); - nucleotideBinaryReverseComplementSlot[param.kmerBinarySlots-1] |= transitBit1; - } - - /* - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinaryReverseComplement >>>= 2; - nucleotideIntComplement <<= 2 * (param.kmerSize - 1); - } else { - nucleotideIntComplement <<= 2 * (i - param.frontClip); - } - nucleotideBinaryReverseComplement |= nucleotideIntComplement; -*/ - // reach the first complete K-mer - if (i - param.frontClip >= param.kmerSize - 1) { - - - - - if (compareLongArrayBlocks(nucleotideBinarySlot, nucleotideBinaryReverseComplementSlot) == true) { - // System.out.println(nucleotideBinarySlot[0] + " forward " + nucleotideBinarySlot[1] + " rc " + nucleotideBinaryReverseComplementSlot[0]); - - long[] nucleotideBinarySlotPreRow = new long[param.kmerBinarySlots]; - for (int j=0; j>> (2 * (31 - j)); - shiftedBinary1 &= 3L; - long shiftedBinary2 = reverse[i] >>> (2 * (31 - j)); - shiftedBinary2 &= 3L; - - if (shiftedBinary1 < shiftedBinary2) { - return true; - } else if (shiftedBinary1 > shiftedBinary2) { - return false; - } - } - }else{ - for (int j = 0; j < param.kmerSizeResidue; j++) { - long shiftedBinary1 = forward[i] >>> (2 * (param.kmerSizeResidue -1 - j)); - shiftedBinary1 &= 3L; - long shiftedBinary2 = reverse[i] >>> (2 * (param.kmerSizeResidue -1 - j)); - shiftedBinary2 &= 3L; - - if (shiftedBinary1 < shiftedBinary2) { - return true; - } else if (shiftedBinary1 > shiftedBinary2) { - return false; - } - } - } - } - - // should not happen - return true; - } - - // for testing, remove afterwards - private char BinaryToNucleotide (Long twoBits){ - char nucleotide; - if (twoBits == 0L){ - nucleotide = 'A'; - }else if (twoBits == 1L){ - nucleotide = 'C'; - }else if (twoBits == 2L){ - nucleotide = 'G'; - }else{ - nucleotide = 'T'; - } - return nucleotide; - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - - private boolean compareLongArray (Long[] a, Long[] b){ - - return true; - } - - private Long[] shiftLongArrayBinary (Long[] previousKmer){ - return previousKmer; - } - } - - - class ReverseComplementKmerBinaryExtractionFromDataset implements MapPartitionsFunction, Serializable { - long maxKmerBits = ~((~0L) << (2 * param.kmerSize)); - - List kmerList = new ArrayList(); - int readLength; - String[] units; - String read; - char nucleotide; - long nucleotideInt; - long nucleotideIntComplement; - - public Iterator call(Iterator s) { - - while (s.hasNext()) { - units = s.next().split("\\n"); - read = units[1]; - readLength = read.length(); - - if (readLength - param.kmerSize - param.endClip <= 1 || param.frontClip > readLength) { - continue; - } - - Long nucleotideBinary = 0L; - Long nucleotideBinaryReverseComplement = 0L; - - for (int i = param.frontClip; i < readLength - param.endClip; i++) { - nucleotide = read.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideBinary <<= 2; - nucleotideBinary |= nucleotideInt; - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinary &= maxKmerBits; - } - - // reverse kmer binarizationalitivities :) non English native speaking people making fun of English - nucleotideIntComplement = nucleotideInt ^ 3; // 3 is binary 11; complement: 11(T) to 00(A), 10(G) to 01(C) - - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinaryReverseComplement >>>= 2; - nucleotideIntComplement <<= 2 * (param.kmerSize - 1); - } else { - nucleotideIntComplement <<= 2 * (i - param.frontClip); - } - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - - // reach the first complete K-mer - if (i - param.frontClip >= param.kmerSize - 1) { - if (nucleotideBinary.compareTo(nucleotideBinaryReverseComplement) < 0) { - kmerList.add(nucleotideBinary); - } else { - kmerList.add(nucleotideBinaryReverseComplement); - } - } - } - } - return kmerList.iterator(); - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - } - /** * interface class for RDD implementation, used in step 3 * ----------- diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMain.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMain.java index 20bea97..5fa6bb9 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMain.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMain.java @@ -523,8 +523,7 @@ public void assemblyFromKmer(){ ReflexivSubKmerDS = ReflexivSubKmerDS.sort("k-1"); DSBinaryReflexivKmerToString StringOutputDS = new DSBinaryReflexivKmerToString(); - // Dataset ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, reflexivKmerStringEncoder); - //ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + 1); + DSExtendReflexivKmer DSKmerExtention = new DSExtendReflexivKmer(); ReflexivSubKmerDS = ReflexivSubKmerDS.mapPartitions(DSKmerExtention, ReflexivSubKmerEncoder); @@ -541,9 +540,6 @@ public void assemblyFromKmer(){ // ReflexivSubKmerDS.cache(); iterations++; - //ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, ReflexivKmerStringEncoder); - // ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); - //ReflexivSubKmerStringDS.write().format("csv").save(param.outputPath + iterations); /** * Extract Long sub kmer @@ -589,19 +585,9 @@ public void assemblyFromKmer(){ ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.sort("k-1"); -// ReflexivLongSubKmerDS.cache(); -// ReflexivLongSubKmerStringDS = ReflexivLongSubKmerDS.mapPartitions(DSArrayStringOutput, ReflexivLongKmerStringEncoder); -// ReflexivLongSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); -// ReflexivSubKmerStringDS= ReflexivLongSubKmerDS.mapPartitions(StringOutputDS, reflexivKmerStringEncoder); -// ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); -// ReflexivSubKmerStringRDD = ReflexivLongSubKmerRDD.mapPartitionsToPair(ArrayStringOutput); -// ReflexivSubKmerStringRDD.saveAsTextFile(param.outputPath + iterations); ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.mapPartitions(DSKmerExtenstionArrayToArray, ReflexivLongKmerEncoder); -// ReflexivSubKmerStringRDD = ReflexivLongSubKmerRDD.mapPartitionsToPair(ArrayStringOutput); -// ReflexivSubKmerStringRDD.saveAsTextFile(param.outputPath + iterations + "Extend"); - } ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.sort("k-1"); diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMain64.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMain64.java index 2d86879..df75e1d 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMain64.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMain64.java @@ -151,12 +151,7 @@ public void assembly() { kmerCountTupleStruct = kmerCountTupleStruct.add("kmerBlocks", DataTypes.createArrayType(DataTypes.LongType), false); kmerCountTupleStruct = kmerCountTupleStruct.add("count", DataTypes.IntegerType, false); ExpressionEncoder KmerBinaryCountEncoder = RowEncoder.apply(kmerCountTupleStruct); -/* - StructType kmerBinaryStruct = new StructType(); - kmerBinaryStruct = kmerBinaryStruct.add("kmerBlocks", DataTypes.createArrayType(DataTypes.LongType), false); - kmerBinaryStruct = kmerBinaryStruct.add("count", DataTypes.IntegerType, false); - ExpressionEncoder kmerBinaryEncoder = RowEncoder.apply(kmerBinaryStruct); -*/ + Dataset ReflexivSubKmerDS; StructType ReflexivKmerStruct = new StructType(); ReflexivKmerStruct = ReflexivKmerStruct.add("k-1", DataTypes.createArrayType(DataTypes.LongType), false); @@ -538,8 +533,7 @@ public void assemblyFromKmer() { ReflexivSubKmerDS = ReflexivSubKmerDS.sort("k-1"); DSBinaryReflexivKmerToString StringOutputDS = new DSBinaryReflexivKmerToString(); - // Dataset ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, reflexivKmerStringEncoder); - //ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + 1); + DSExtendReflexivKmer DSKmerExtention = new DSExtendReflexivKmer(); ReflexivSubKmerDS = ReflexivSubKmerDS.mapPartitions(DSKmerExtention, ReflexivSubKmerEncoder); @@ -557,10 +551,6 @@ public void assemblyFromKmer() { iterations++; - //ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, ReflexivKmerStringEncoder); - // ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); - //ReflexivSubKmerStringDS.write().format("csv").save(param.outputPath + iterations); - /** * Extract Long sub kmer */ @@ -675,18 +665,7 @@ public void assemblyFromKmer() { System.out.println("mark iteration: " + iterations + " has kmers: " + IterationCount); ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.sort("k-1"); -// ReflexivLongSubKmerDS.cache(); -// ReflexivLongSubKmerStringDS = ReflexivLongSubKmerDS.mapPartitions(DSArrayStringOutput, ReflexivLongKmerStringEncoder); -// ReflexivLongSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); -// ReflexivSubKmerStringDS= ReflexivLongSubKmerDS.mapPartitions(StringOutputDS, reflexivKmerStringEncoder); -// ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); -// ReflexivSubKmerStringRDD = ReflexivLongSubKmerRDD.mapPartitionsToPair(ArrayStringOutput); -// ReflexivSubKmerStringRDD.saveAsTextFile(param.outputPath + iterations); - ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.mapPartitions(DSKmerExtenstionArrayToArray, ReflexivLongKmerEncoder); - -// ReflexivSubKmerStringRDD = ReflexivLongSubKmerRDD.mapPartitionsToPair(ArrayStringOutput); -// ReflexivSubKmerStringRDD.saveAsTextFile(param.outputPath + iterations + "Extend"); } } @@ -817,18 +796,6 @@ public void assemblyFromKmer() { ReflexivLongSubKmerStringDS = ReflexivLongSubKmerDS.mapPartitions(DSArrayStringOutput, ReflexivLongKmerStringEncoder); - /** - * - */ - // DSKmerToContigLength contigLengthDS = new DSKmerToContigLength(); - // ContigLengthRows = ReflexivLongSubKmerStringDS.mapPartitions(contigLengthDS, ContigLengthEncoder); - - - // DSFormatContigs ContigFormater = new DSFormatContigs(); - // ContigRows= ContigMergedRow.mapPartitions(ContigFormater, ContigStringEncoder); - - - DSKmerToContig contigformaterDS = new DSKmerToContig(); ContigRows = ReflexivLongSubKmerStringDS.mapPartitions(contigformaterDS, ContigStringEncoder); @@ -872,20 +839,6 @@ public Iterator call(Tuple2 s) { return contigList.iterator(); } } - - class TagContigID implements FlatMapFunction, Long>, String>, Serializable { - - public Iterator call(Tuple2, Long> s) { - - - List contigList = new ArrayList(); - - contigList.add(s._1._1 + "-" + s._2 + "\n" + s._1._2); - - return contigList.iterator(); - } - } - class DSKmerToContig implements MapPartitionsFunction, Serializable { public Iterator call(Iterator sIterator) { @@ -939,7 +892,6 @@ public String changeLine(String oneLine, int lineLength, int limitedLength) { } } - class DSLowCoverageSubKmerExtraction implements MapPartitionsFunction, Serializable{ List subKmerProb = new ArrayList(); int randomReflexivMarker =1; @@ -10817,38 +10769,6 @@ private char BinaryToNucleotide(Long twoBits) { } } - class DSKmerReverseComplementLong implements MapPartitionsFunction, Serializable { - /* a capsule for all Kmers and reverseComplementKmers */ - List kmerList = new ArrayList(); - Long reverseComplement; - Row kmerTuple; - Long lastTwoBits; - Long kmerBinary; - - - public Iterator call(Iterator s) { - - - while (s.hasNext()) { - kmerTuple = s.next(); - kmerBinary = kmerTuple.getLong(0); - reverseComplement = 0L; - for (int i = 0; i < param.kmerSize; i++) { - reverseComplement <<= 2; - - lastTwoBits = kmerBinary & 3L ^ 3L; - kmerBinary >>>= 2; - reverseComplement |= lastTwoBits; - } - - kmerList.add(RowFactory.create(kmerTuple.getLong(0), (int) kmerTuple.getLong(1))); - kmerList.add(RowFactory.create(reverseComplement, (int) kmerTuple.getLong(1))); - } - - return kmerList.iterator(); - } - } - class KmerBinarizer implements MapPartitionsFunction, Serializable { List kmerList = new ArrayList(); @@ -11170,92 +11090,6 @@ private Long[] shiftLongArrayBinary (Long[] previousKmer){ } - class ReverseComplementKmerBinaryExtractionFromDataset implements MapPartitionsFunction, Serializable { - long maxKmerBits = ~((~0L) << (2 * param.kmerSize)); - - List kmerList = new ArrayList(); - int readLength; - String[] units; - String read; - char nucleotide; - long nucleotideInt; - long nucleotideIntComplement; - - public Iterator call(Iterator s) { - - while (s.hasNext()) { - units = s.next().split("\\n"); - read = units[1]; - readLength = read.length(); - - if (readLength - param.kmerSize - param.endClip <= 1 || param.frontClip > readLength) { - continue; - } - - Long nucleotideBinary = 0L; - Long nucleotideBinaryReverseComplement = 0L; - - for (int i = param.frontClip; i < readLength - param.endClip; i++) { - nucleotide = read.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideBinary <<= 2; - nucleotideBinary |= nucleotideInt; - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinary &= maxKmerBits; - } - - // reverse kmer binarizationalitivities :) non English native speaking people making fun of English - nucleotideIntComplement = nucleotideInt ^ 3; // 3 is binary 11; complement: 11(T) to 00(A), 10(G) to 01(C) - - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinaryReverseComplement >>>= 2; - nucleotideIntComplement <<= 2 * (param.kmerSize - 1); - } else { - nucleotideIntComplement <<= 2 * (i - param.frontClip); - } - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - - // reach the first complete K-mer - if (i - param.frontClip >= param.kmerSize - 1) { - if (nucleotideBinary.compareTo(nucleotideBinaryReverseComplement) < 0) { - kmerList.add(nucleotideBinary); - } else { - kmerList.add(nucleotideBinaryReverseComplement); - } - } - } - } - return kmerList.iterator(); - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - } - - /** - * interface class for RDD implementation, used in step 3 - * ----------- - * ------ - * ------ - * ------ - * ------ - * ------ - * ------ - */ - class DSFastqUnitFilter implements FilterFunction, Serializable { public boolean call(String s) { diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMainMercy.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMainMercy.java index dad6e30..9ba6a17 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMainMercy.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMainMercy.java @@ -2013,82 +2013,6 @@ public void tmpKmerRandomizer(){ } } - - class ReverseComplementKmerBinaryExtractionFromDataset implements MapPartitionsFunction, Serializable{ - long maxKmerBits= ~((~0L) << (2*param.kmerSize)); - - List kmerList = new ArrayList(); - int readLength; - String[] units; - String read; - char nucleotide; - long nucleotideInt; - long nucleotideIntComplement; - - public Iterator call(Iterator s){ - - while (s.hasNext()) { - units = s.next().split("\\n"); - read = units[1]; - readLength = read.length(); - - if (readLength - param.kmerSize - param.endClip <= 1 || param.frontClip > readLength) { - continue; - } - - Long nucleotideBinary = 0L; - Long nucleotideBinaryReverseComplement = 0L; - - for (int i = param.frontClip; i < readLength - param.endClip; i++) { - nucleotide = read.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideBinary <<= 2; - nucleotideBinary |= nucleotideInt; - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinary &= maxKmerBits; - } - - // reverse kmer binarizationalitivities :) non English native speaking people making fun of English - nucleotideIntComplement = nucleotideInt ^ 3; // 3 is binary 11; complement: 11(T) to 00(A), 10(G) to 01(C) - - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinaryReverseComplement >>>= 2; - nucleotideIntComplement <<= 2 * (param.kmerSize - 1); - } else { - nucleotideIntComplement <<= 2 * (i - param.frontClip); - } - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - - // reach the first complete K-mer - if (i - param.frontClip >= param.kmerSize - 1) { - if (nucleotideBinary.compareTo(nucleotideBinaryReverseComplement) < 0) { - kmerList.add(nucleotideBinary); - } else { - kmerList.add(nucleotideBinaryReverseComplement); - } - } - } - } - return kmerList.iterator(); - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - } - /** * interface class for RDD implementation, used in step 3 * ----------- diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMainMeta64.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMainMeta64.java index aedf83d..8a62688 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMainMeta64.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMainMeta64.java @@ -133,12 +133,7 @@ public void assembly() { kmerCountTupleStruct = kmerCountTupleStruct.add("kmerBlocks", DataTypes.createArrayType(DataTypes.LongType), false); kmerCountTupleStruct = kmerCountTupleStruct.add("count", DataTypes.IntegerType, false); ExpressionEncoder KmerBinaryCountEncoder = RowEncoder.apply(kmerCountTupleStruct); -/* - StructType kmerBinaryStruct = new StructType(); - kmerBinaryStruct = kmerBinaryStruct.add("kmerBlocks", DataTypes.createArrayType(DataTypes.LongType), false); - kmerBinaryStruct = kmerBinaryStruct.add("count", DataTypes.IntegerType, false); - ExpressionEncoder kmerBinaryEncoder = RowEncoder.apply(kmerBinaryStruct); -*/ + Dataset ReflexivSubKmerDS; StructType ReflexivKmerStruct = new StructType(); ReflexivKmerStruct = ReflexivKmerStruct.add("k-1", DataTypes.createArrayType(DataTypes.LongType), false); @@ -505,8 +500,6 @@ public void assemblyFromKmer() { ReflexivSubKmerDS = ReflexivSubKmerDS.sort("k-1"); DSBinaryReflexivKmerToString StringOutputDS = new DSBinaryReflexivKmerToString(); - // Dataset ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, reflexivKmerStringEncoder); - //ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + 1); DSExtendReflexivKmer DSKmerExtention = new DSExtendReflexivKmer(); ReflexivSubKmerDS = ReflexivSubKmerDS.mapPartitions(DSKmerExtention, ReflexivSubKmerEncoder); @@ -524,10 +517,6 @@ public void assemblyFromKmer() { iterations++; - //ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, ReflexivKmerStringEncoder); - // ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); - //ReflexivSubKmerStringDS.write().format("csv").save(param.outputPath + iterations); - /** * Extract Long sub kmer */ @@ -603,10 +592,6 @@ public void assemblyFromKmer() { DSKmerToContig contigformaterDS = new DSKmerToContig(); ContigRows = ReflexivLongSubKmerStringDS.mapPartitions(contigformaterDS, ContigStringEncoder); - /* - DSKmerToSmallContig smallContigformaterDS = new DSKmerToSmallContig(); - SmallContigRows = ReflexivLongSubKmerStringDS.mapPartitions(smallContigformaterDS, ContigStringEncoder); -*/ /** * */ diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMerger.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMerger.java index e186532..ca241b7 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMerger.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSMerger.java @@ -117,62 +117,16 @@ public void assembly(){ info.screenDump(); Dataset FastqDS; - Dataset KmerBinaryDS; - - Dataset KmerBinaryCountLongDS; - Dataset KmerBinaryCountDS; - StructType kmerCountTupleStruct = new StructType(); - kmerCountTupleStruct= kmerCountTupleStruct.add("kmer", DataTypes.LongType, false); - kmerCountTupleStruct= kmerCountTupleStruct.add("count", DataTypes.IntegerType, false); - ExpressionEncoder KmerBinaryCountEncoder = RowEncoder.apply(kmerCountTupleStruct); - - Dataset ReflexivSubKmerDS; - StructType ReflexivKmerStruct = new StructType(); - ReflexivKmerStruct= ReflexivKmerStruct.add("k-1", DataTypes.LongType, false); - ReflexivKmerStruct= ReflexivKmerStruct.add("reflection", DataTypes.IntegerType, false); - ReflexivKmerStruct= ReflexivKmerStruct.add("extension", DataTypes.LongType, false); - ReflexivKmerStruct= ReflexivKmerStruct.add("left", DataTypes.IntegerType, false); - ReflexivKmerStruct= ReflexivKmerStruct.add("right", DataTypes.IntegerType, false); - ExpressionEncoder ReflexivSubKmerEncoder = RowEncoder.apply(ReflexivKmerStruct); - - Dataset ReflexivSubKmerStringDS; - StructType ReflexivKmerStringStruct = new StructType(); - ReflexivKmerStringStruct= ReflexivKmerStringStruct.add("k-1", DataTypes.StringType, false); - ReflexivKmerStringStruct= ReflexivKmerStringStruct.add("reflection", DataTypes.IntegerType, false); - ReflexivKmerStringStruct= ReflexivKmerStringStruct.add("extension", DataTypes.StringType, false); - ReflexivKmerStringStruct= ReflexivKmerStringStruct.add("left", DataTypes.IntegerType, false); - ReflexivKmerStringStruct= ReflexivKmerStringStruct.add("right", DataTypes.IntegerType, false); - ExpressionEncoder ReflexivKmerStringEncoder = RowEncoder.apply(ReflexivKmerStringStruct); - - Dataset ReflexivLongSubKmerDS; - StructType ReflexivLongKmerStruct = new StructType(); - ReflexivLongKmerStruct= ReflexivLongKmerStruct.add("k-1", DataTypes.LongType, false); - ReflexivLongKmerStruct= ReflexivLongKmerStruct.add("reflection", DataTypes.IntegerType, false); - ReflexivLongKmerStruct= ReflexivLongKmerStruct.add("extension", DataTypes.createArrayType(DataTypes.LongType), false); - ReflexivLongKmerStruct= ReflexivLongKmerStruct.add("left", DataTypes.IntegerType, false); - ReflexivLongKmerStruct= ReflexivLongKmerStruct.add("right", DataTypes.IntegerType, false); - ExpressionEncoder ReflexivLongKmerEncoder = RowEncoder.apply(ReflexivLongKmerStruct); - - Dataset ReflexivLongSubKmerStringDS; - StructType ReflexivLongKmerStringStruct = new StructType(); - ReflexivLongKmerStringStruct= ReflexivLongKmerStringStruct.add("k-1", DataTypes.StringType, false); - ReflexivLongKmerStringStruct= ReflexivLongKmerStringStruct.add("reflection", DataTypes.IntegerType, false); - ReflexivLongKmerStringStruct= ReflexivLongKmerStringStruct.add("extension", DataTypes.StringType, false); - ReflexivLongKmerStringStruct= ReflexivLongKmerStringStruct.add("left", DataTypes.IntegerType, false); - ReflexivLongKmerStringStruct= ReflexivLongKmerStringStruct.add("right", DataTypes.IntegerType, false); - ExpressionEncoder ReflexivLongKmerStringEncoder = RowEncoder.apply(ReflexivLongKmerStringStruct); Dataset ContigLengthRows; - Dataset ContigLengthRowsLarge; - Dataset ContigLengthRowsSmall; + StructType ContigLengthStruct = new StructType(); ContigLengthStruct = ContigLengthStruct.add("length", DataTypes.DoubleType, false); ContigLengthStruct = ContigLengthStruct.add("contig", DataTypes.StringType, false); ExpressionEncoder ContigLengthEncoder = RowEncoder.apply(ContigLengthStruct); Dataset ContigMergedRow; - Dataset ContigMergedRowLarge; - Dataset ContigMergedRowSmall; + StructType ContigMergedStruct = new StructType(); ContigMergedStruct = ContigMergedStruct.add("contig", DataTypes.StringType, false); ExpressionEncoder ContigMergedEncoder = RowEncoder.apply(ContigMergedStruct); @@ -189,16 +143,8 @@ public void assembly(){ FastqDS = spark.read().text(param.inputFqPath).as(Encoders.STRING()); - // DSFastqFilterWithQual DSFastqFilter = new DSFastqFilterWithQual(); - // FastqDS = FastqDS.map(DSFastqFilter, Encoders.STRING()); - - // DSFastqUnitFilter FilterDSUnit = new DSFastqUnitFilter(); - - // FastqDS = FastqDS.filter(FilterDSUnit); - DSContigInputParser contigParser = new DSContigInputParser(); - ContigLengthRows = FastqDS.mapPartitions(contigParser, ContigLengthEncoder); ContigLengthRows = ContigLengthRows.repartition(param.partitions); @@ -207,8 +153,6 @@ public void assembly(){ * */ - DSMergeReverseComplementaryContigs RCcontigMerger = new DSMergeReverseComplementaryContigs(); - DSMergeRedundantContigs RedundantMerger = new DSMergeRedundantContigs(); DSMergeRedundantNonRCContigs RedundantNonRCMerger = new DSMergeRedundantNonRCContigs(); ContigLengthRows.cache(); @@ -216,16 +160,6 @@ public void assembly(){ ContigLengthRows = ContigLengthRows.sort("length"); - // DSContgiLengthRowsToPairedRDD RowsToPair = new DSContgiLengthRowsToPairedRDD(); - // JavaPairRDD ContigPairedRDD = ContigLengthRows.toJavaRDD().mapPartitionsToPair(RowsToPair); - // ContigPairedRDD = ContigPairedRDD.sortByKey(true, param.partitions); - - // ContigLengthRows = spark.createDataset(JavaPairRDD.toRDD(ContigPairedRDD), Encoders.tuple(Encoders.INT(),Encoders.STRING())).toDF(); - - // PairedRDDToContigLengthRows PairToRows = new PairedRDDToContigLengthRows(); - // ContigLengthRows = ContigPairedRDD.map(PairToRows); - - // ContigLengthRows = ContigLengthRows.sort("length"); ContigLengthRows.cache(); @@ -234,24 +168,12 @@ public void assembly(){ ContigMergedRow = ContigLengthRows.mapPartitions(RedundantNonRCMerger, ContigMergedEncoder); - // ContigMergedRow.cache(); -// ContigMergedRow.toJavaRDD().saveAsTextFile(param.outputPath + 2); - /** * */ DSFormatContigs ContigFormater = new DSFormatContigs(); ContigRows= ContigMergedRow.mapPartitions(ContigFormater, ContigStringEncoder); - /** - * - */ - /* - DSKmerToContig contigformaterDS = new DSKmerToContig(); - ContigRows = ReflexivLongSubKmerStringDS.mapPartitions(contigformaterDS, ContigStringEncoder); - */ - - /** * */ @@ -297,15 +219,6 @@ public void assemblyFromKmer(){ ReflexivKmerStruct= ReflexivKmerStruct.add("right", DataTypes.IntegerType, false); ExpressionEncoder ReflexivSubKmerEncoder = RowEncoder.apply(ReflexivKmerStruct); - Dataset ReflexivSubKmerStringDS; - StructType ReflexivKmerStringStruct = new StructType(); - ReflexivKmerStringStruct= ReflexivKmerStringStruct.add("k-1", DataTypes.StringType, false); - ReflexivKmerStringStruct= ReflexivKmerStringStruct.add("reflection", DataTypes.IntegerType, false); - ReflexivKmerStringStruct= ReflexivKmerStringStruct.add("extension", DataTypes.StringType, false); - ReflexivKmerStringStruct= ReflexivKmerStringStruct.add("left", DataTypes.IntegerType, false); - ReflexivKmerStringStruct= ReflexivKmerStringStruct.add("right", DataTypes.IntegerType, false); - ExpressionEncoder ReflexivKmerStringEncoder = RowEncoder.apply(ReflexivKmerStringStruct); - Dataset ReflexivLongSubKmerDS; StructType ReflexivLongKmerStruct = new StructType(); ReflexivLongKmerStruct= ReflexivLongKmerStruct.add("k-1", DataTypes.LongType, false); @@ -325,16 +238,13 @@ public void assemblyFromKmer(){ ExpressionEncoder ReflexivLongKmerStringEncoder = RowEncoder.apply(ReflexivLongKmerStringStruct); Dataset ContigLengthRows; - Dataset ContigLengthRowsLarge; - Dataset ContigLengthRowsSmall; StructType ContigLengthStruct = new StructType(); ContigLengthStruct = ContigLengthStruct.add("length", DataTypes.StringType, false); ContigLengthStruct = ContigLengthStruct.add("contig", DataTypes.StringType, false); ExpressionEncoder ContigLengthEncoder = RowEncoder.apply(ContigLengthStruct); Dataset ContigMergedRow; - Dataset ContigMergedRowLarge; - Dataset ContigMergedRowSmall; + StructType ContigMergedStruct = new StructType(); ContigMergedStruct = ContigMergedStruct.add("contig", DataTypes.StringType, false); ExpressionEncoder ContigMergedEncoder = RowEncoder.apply(ContigMergedStruct); @@ -425,10 +335,6 @@ public void assemblyFromKmer(){ ReflexivSubKmerDS = ReflexivSubKmerDS.sort("k-1"); - DSBinaryReflexivKmerToString StringOutputDS = new DSBinaryReflexivKmerToString(); - // Dataset ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, reflexivKmerStringEncoder); - //ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + 1); - DSExtendReflexivKmer DSKmerExtention = new DSExtendReflexivKmer(); ReflexivSubKmerDS = ReflexivSubKmerDS.mapPartitions(DSKmerExtention, ReflexivSubKmerEncoder); @@ -444,10 +350,6 @@ public void assemblyFromKmer(){ // ReflexivSubKmerDS.cache(); iterations++; - //ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, ReflexivKmerStringEncoder); - // ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); - //ReflexivSubKmerStringDS.write().format("csv").save(param.outputPath + iterations); - /** * Extract Long sub kmer */ @@ -491,19 +393,7 @@ public void assemblyFromKmer(){ ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.sort("k-1"); -// ReflexivLongSubKmerDS.cache(); -// ReflexivLongSubKmerStringDS = ReflexivLongSubKmerDS.mapPartitions(DSArrayStringOutput, ReflexivLongKmerStringEncoder); -// ReflexivLongSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); -// ReflexivSubKmerStringDS= ReflexivLongSubKmerDS.mapPartitions(StringOutputDS, reflexivKmerStringEncoder); -// ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); -// ReflexivSubKmerStringRDD = ReflexivLongSubKmerRDD.mapPartitionsToPair(ArrayStringOutput); -// ReflexivSubKmerStringRDD.saveAsTextFile(param.outputPath + iterations); - ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.mapPartitions(DSKmerExtenstionArrayToArray, ReflexivLongKmerEncoder); - -// ReflexivSubKmerStringRDD = ReflexivLongSubKmerRDD.mapPartitionsToPair(ArrayStringOutput); -// ReflexivSubKmerStringRDD.saveAsTextFile(param.outputPath + iterations + "Extend"); - } ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.sort("k-1"); @@ -518,16 +408,6 @@ public void assemblyFromKmer(){ DSKmerToContigLength contigLengthDS = new DSKmerToContigLength(); ContigLengthRows = ReflexivLongSubKmerStringDS.mapPartitions(contigLengthDS, ContigLengthEncoder); - // ContigLengthRows.cache(); - // ContigLengthRows.toJavaRDD().saveAsTextFile(param.outputPath + iterations); - - - - // ContigLengthRows.cache(); - // ContigLengthRows.toJavaRDD().saveAsTextFile(param.outputPath + iterations + "sort"); - - - DSMergeReverseComplementaryContigs RCcontigMerger = new DSMergeReverseComplementaryContigs(); @@ -539,11 +419,6 @@ public void assemblyFromKmer(){ DSFormatContigs ContigFormater = new DSFormatContigs(); ContigRows= ContigMergedRow.mapPartitions(ContigFormater, ContigStringEncoder); - - /* - DSKmerToContig contigformaterDS = new DSKmerToContig(); - ContigRows = ReflexivLongSubKmerStringDS.mapPartitions(contigformaterDS, ContigStringEncoder); - */ /** * */ @@ -568,484 +443,13 @@ public Iterator call(Tuple2 s) { List contigList = new ArrayList(); - contigList.add(s._1.getString(0) + "-" + s._2 + "\n" + s._1.getString(1)); - - return contigList.iterator(); - } - } - - class TagContigID implements FlatMapFunction, Long>, String>, Serializable { - - public Iterator call(Tuple2, Long> s) { - - - List contigList = new ArrayList(); - - contigList.add(s._1._1 + "-" + s._2 + "\n" + s._1._2); - - return contigList.iterator(); - } - } - - class PairedRDDToContigLengthRows implements Function, Row>, Serializable{ - public Row call(Tuple2 s){ - return RowFactory.create(s._1(), s._2()); - } - - } - - class DSContgiLengthRowsToPairedRDD implements PairFlatMapFunction, Integer, String>, Serializable{ - - List> contigPairList = new ArrayList>(); - - public Iterator> call (Iterator sIterator){ - - while (sIterator.hasNext()){ - Row contigPair = sIterator.next(); - - contigPairList.add(new Tuple2(contigPair.getInt(0), contigPair.getString(1))); - } - - return contigPairList.iterator(); - } - } - - - class DSMergeRedundantNonRCContigs implements MapPartitionsFunction, Serializable{ - List uniqueContig = new ArrayList(); - List contigList = new ArrayList(); - Row contig; - String contigString; - int contigLength; - Hashtable probKmerTable = new Hashtable(); - Hashtable redundantTable = new Hashtable(); - Hashtable overlapTable = new Hashtable(); - Hashtable overlapTableRight = new Hashtable(); - int index=0; - long maxContigLengthBinary = ~((~0L) << 30); - - - public Iterator call (Iterator sIterator){ - while (sIterator.hasNext()){ - contig = sIterator.next(); - index++; - contigString = contig.getString(1); - contigLength = contigString.length(); - String probKmer; - - - if (contigLength >= param.kmerSize){ - probKmer = contigString.substring(0, param.kmerSize); - if (!probKmerTable.containsKey(probKmer)) { - probKmerTable.put(probKmer, index-1); - }else{ - if (contigList.get(probKmerTable.get(probKmer)).length() < contigLength){ - probKmerTable.put(probKmer, index-1); - } - } - } - - contigList.add(contigString); - } - - for (int i = 0; i< contigList.size(); i++){ - String contigAgain = contigList.get(i); - // String RCcontigAgain = reverseComplement(contigAgain); - for (int j=0; j< contigAgain.length()-param.kmerSize;j++){ - String kmerSearch = contigAgain.substring(j, j+param.kmerSize); - // String kmerRCSearch = RCcontigAgain.substring(j, j+param.kmerSize); - if (probKmerTable.containsKey(kmerSearch)){ - if (contigAgain.length() > contigList.get(probKmerTable.get(kmerSearch)).length()){ - redundantTable.put(probKmerTable.get(kmerSearch), true); - - if (contigList.get(probKmerTable.get(kmerSearch)).length() > contigAgain.length() -j){ - if (overlapTableRight.containsKey(i)){ - if ( (int)(overlapTableRight.get(i) & maxContigLengthBinary) < contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() -j)){ - Long extension = 1L << 30; // right extension - extension |= (1L << 62); // not reverse complement - extension |= ( contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() -j) ); - extension |= ((long) probKmerTable.get(kmerSearch) << 32); - overlapTableRight.put(i, extension); - // System.out.println("longer + right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); - }else{ - // longer extension already exist - } - }else{ - Long extension = 1L << 30; // right extension - extension |= (1L << 62); // not reverse complement - extension |= ( contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() -j) ); - extension |= ((long) probKmerTable.get(kmerSearch) << 32); - overlapTableRight.put(i, extension); - // System.out.println("right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); - } - } - - }else if (contigAgain.length() == contigList.get(probKmerTable.get(kmerSearch)).length()){ // - // find itself. - if (i == probKmerTable.get(kmerSearch)){ - //itself - }else{ - // two identical contigs - if (i < probKmerTable.get(kmerSearch)) { - redundantTable.put(probKmerTable.get(kmerSearch), true); - - if (j>0) { - if (overlapTableRight.containsKey(i)) { - if ((int) (overlapTableRight.get(i) & maxContigLengthBinary) < contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() - j)) { - Long extension = 1L << 30; // right extension - extension |= (1L << 62); // not reverse complement - extension |= (contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() - j)); - extension |= ((long) probKmerTable.get(kmerSearch) << 32); - overlapTableRight.put(i, extension); - // System.out.println("longer + equal + right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); - } else { - // longer extension already exist - } - } else { - Long extension = 1L << 30; // right extension - extension |= (1L << 62); // not reverse complement - extension |= (contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() - j)); - extension |= ((long) probKmerTable.get(kmerSearch) << 32); - overlapTableRight.put(i, extension); - // System.out.println("equal + right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); - } - } - } else { - redundantTable.put(i, true); - if (j>0) { - if (overlapTable.containsKey(probKmerTable.get(kmerSearch))) { - if ((int) (overlapTable.get(probKmerTable.get(kmerSearch)) & maxContigLengthBinary) < contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() - j)) { - // Long extension = 0L << 30; // left extension - Long extension = (1L << 62); // not reverse complement - extension |= (contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() - j)); - extension |= ((long) i << 32); - overlapTable.put(probKmerTable.get(kmerSearch), extension); - // System.out.println("longer + equal + left + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); - } else { - // longer extension already exist - } - } else { - // Long extension = 0L << 30; // left extension - Long extension = (1L << 62); // not reverse complement - extension |= (contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() - j)); - extension |= ((long) i << 32); - overlapTable.put(probKmerTable.get(kmerSearch), extension); - // System.out.println("equal + left + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); - } - } - - break; - } - } - } else { // contigAgain.length() < probKmerTable.get(kmerSearch) - redundantTable.put(i, true); - - if (j>0){ - if (overlapTable.containsKey(probKmerTable.get(kmerSearch))) { - if (j > (int) (overlapTable.get(probKmerTable.get(kmerSearch)) & maxContigLengthBinary)) { - // Long extension = 0L << 30; // left extension - Long extension = 1L << 62; // not reverse complement - extension |= ((long) j); - extension |= ((long) i << 32); - overlapTable.put(probKmerTable.get(kmerSearch), extension); - // System.out.println("longer + left + forward: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerSearch))); - } else { - - } - }else{ - // Long extension = 0L << 30; // left extension - Long extension = 1L << 62; // not reverse complement - extension |= ((long) j); - extension |= ((long) i << 32); - overlapTable.put(probKmerTable.get(kmerSearch), extension); - // System.out.println("left + forward: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerSearch))); - } - } - - break; - // not adding, removed - } - } -/* - if (probKmerTable.containsKey(kmerRCSearch)){ - if (contigAgain.length() > contigList.get(probKmerTable.get(kmerRCSearch)).length()){ - redundantTable.put(probKmerTable.get(kmerRCSearch), true); - - if (contigList.get(probKmerTable.get(kmerRCSearch)).length() - param.kmerSize > j){ - if (overlapTable.containsKey(i)){ - if ( (int)(overlapTable.get(i) & maxContigLengthBinary) < contigList.get(probKmerTable.get(kmerRCSearch)).length() - param.kmerSize - j ){ - // Long extension = 0L << 30; // left extension - // extension |= (0L << 62); // reverse complement - Long extension = (long) ( contigList.get(probKmerTable.get(kmerRCSearch)).length() - param.kmerSize - j ); - extension |= ((long) probKmerTable.get(kmerRCSearch) << 32); - overlapTable.put(i, extension); - // System.out.println("longer + right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); - }else{ - // longer extension already exist - } - }else{ - // Long extension = 0L << 30; // left extension - //reverse complement - Long extension = (long) ( contigList.get(probKmerTable.get(kmerRCSearch)).length() - param.kmerSize - j ); - extension |= ((long) probKmerTable.get(kmerRCSearch) << 32); - overlapTable.put(i, extension); - // System.out.println("right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); - } - } - - }else if (contigAgain.length() == contigList.get(probKmerTable.get(kmerRCSearch)).length()){ // - if (i< probKmerTable.get(kmerRCSearch)){ - redundantTable.put(probKmerTable.get(kmerRCSearch), true); - - if (j>0) { - if (overlapTableRight.containsKey(i)) { - if ((int) (overlapTableRight.get(i) & maxContigLengthBinary) < j) { - Long extension = 1L << 30; // right extension - // extension |= (0L << 62); // reverse complement - extension = (long) (j); - extension |= ((long) probKmerTable.get(kmerRCSearch) << 32); - overlapTableRight.put(i, extension); - // System.out.println("longer + equal + right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); - } else { - // longer extension already exist - } - } else { - Long extension = 1L << 30; // Right extension - //reverse complement - extension = (long) (j); - extension |= ((long) probKmerTable.get(kmerRCSearch) << 32); - overlapTableRight.put(i, extension); - // System.out.println("equal + right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); - } - } - }else{ - redundantTable.put(i, true); - - if (j>0) { - if (overlapTable.containsKey(probKmerTable.get(kmerRCSearch))) { - if ((int) (overlapTable.get(probKmerTable.get(kmerRCSearch)) & maxContigLengthBinary) < j) { - // Long extension = 0L << 30; // left extension - // extension |= (0L << 62); // reverse complement - Long extension = (long) (j); - extension |= ((long) i << 32); - overlapTable.put(probKmerTable.get(kmerRCSearch), extension); - // System.out.println("longer + equal + left + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); - } else { - // longer extension already exist - } - } else { - // Long extension = 0L << 30; // left extension - //reverse complement - Long extension = (long) (j); - extension |= ((long) i << 32); - overlapTable.put(probKmerTable.get(kmerRCSearch), extension); - // System.out.println("equal + left + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); - } - } - - break; - } - } else { // contigAgain.length() < probKmerTable.get(kmerRCSearch) - redundantTable.put(i, true); - - if ( j >0){ - if (overlapTable.containsKey(probKmerTable.get(kmerRCSearch))) { - if (j > (int) (overlapTable.get(probKmerTable.get(kmerRCSearch)) & maxContigLengthBinary)) { - // Long extension = 0L << 30; // left extension - // Long extension = 0L << 62; not reverse complement - Long extension = ((long) j); - extension |= ((long) i << 32); - overlapTable.put(probKmerTable.get(kmerRCSearch), extension); - // System.out.println("longer + left + reverse: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerRCSearch))); - } else { - - } - }else{ - // Long extension = 0L << 30; // left extension - // Long extension = 0L << 62; not reverse complement - Long extension = ((long) j); - extension |= ((long) i << 32); - overlapTable.put(probKmerTable.get(kmerRCSearch), extension); - // System.out.println("left + reverse: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerRCSearch))); - } - } - - break; - } - } - */ - } - - } - - for (int i =0; i< contigList.size(); i++){ - if (overlapTable.containsKey(i)){ - Long extension = overlapTable.get(i); - int direction = (int) ((extension >>> 30) & 3L); - int RC = (int) ((extension >>> 62) & 3L); - int contigIndex = (int) ((extension >>> 32) & maxContigLengthBinary); - // int extensionIndex = (int) (extension & maxContigLengthBinary); not use here - if (RC == 0){ // reverse complement - if (direction == 0){ // left extension - String contig = reverseComplement(contigList.get(contigIndex)); - int kmerIndex = contig.indexOf(contigList.get(i).substring(0, 2*param.kmerSize)); - if (kmerIndex == -1){ - // System.out.println(contig); - // System.out.println(contigList.get(i)); - continue; - } - String fragment = contig.substring(0, kmerIndex); - // System.out.println("extension left + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); - contigList.set(i, fragment + contigList.get(i)); - - }else { // right extension - String contig = reverseComplement(contigList.get(contigIndex)); - int kmerIndex = contig.indexOf(contigList.get(i).substring(contigList.get(i).length()-2*param.kmerSize)); - if (kmerIndex == -1){ - // System.out.println(contig); - // System.out.println(contigList.get(i)); - continue; - } - String fragment = contig.substring(kmerIndex+2*param.kmerSize); - // System.out.println("extension right + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); - contigList.set(i, contigList.get(i) + fragment); - } - }else { // not reverse complement - if (direction == 0){ // left extension - String contig = contigList.get(contigIndex); - int kmerIndex = contig.indexOf(contigList.get(i).substring(0, 2*param.kmerSize)); - if (kmerIndex == -1){ - // System.out.println(contig); - // System.out.println(contigList.get(i)); - continue; - } - String fragment = contig.substring(0, kmerIndex); - // System.out.println("extension left + forward: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); - contigList.set(i, fragment + contigList.get(i)); - }else { // right extension - String contig = contigList.get(contigIndex); - int kmerIndex = contig.indexOf(contigList.get(i).substring(contigList.get(i).length()-2*param.kmerSize)); - if (kmerIndex == -1){ -// System.out.println(contig); -// System.out.println(contigList.get(i)); - continue; - } - String fragment = contig.substring(kmerIndex+2*param.kmerSize); - // System.out.println("extension right + forward: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); - contigList.set(i, contigList.get(i) + fragment); - } - } - } - - if (overlapTableRight.containsKey(i)){ - Long extension = overlapTableRight.get(i); - int direction = (int) ((extension >>> 30) & 3L); - int RC = (int) ((extension >>> 62) & 3L); - int contigIndex = (int) ((extension >>> 32) & maxContigLengthBinary); - // int extensionIndex = (int) (extension & maxContigLengthBinary); not use here - if (RC == 0){ // reverse complement - if (direction == 0){ // left extension - String contig = reverseComplement(contigList.get(contigIndex)); - int kmerIndex = contig.indexOf(contigList.get(i).substring(0, 2*param.kmerSize)); - if (kmerIndex == -1){ -// System.out.println(contig); -// System.out.println(contigList.get(i)); - continue; - } - String fragment = contig.substring(0, kmerIndex); - // System.out.println("extension left + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); - contigList.set(i, fragment + contigList.get(i)); - - }else { // right extension - String contig = reverseComplement(contigList.get(contigIndex)); - int kmerIndex = contigList.get(i).indexOf(contig.substring(0, 2*param.kmerSize)); - // int kmerIndex = contig.indexOf(contigList.get(i).substring(contigList.get(i).length()-param.kmerSize)); - if (kmerIndex == -1){ - // System.out.println(contig); - // System.out.println(contigList.get(i)); - continue; - } - - kmerIndex = contigList.get(i).length() - kmerIndex; - if (contig.length() <= kmerIndex){ - continue; - } - - String fragment = contig.substring(kmerIndex); - // System.out.println("extension right + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); - contigList.set(i, contigList.get(i) + fragment); - } - }else { // not reverse complement - if (direction == 0){ // left extension - String contig = contigList.get(contigIndex); - int kmerIndex = contig.indexOf(contigList.get(i).substring(0, 2*param.kmerSize)); - if (kmerIndex == -1){ -// System.out.println(contig); -// System.out.println(contigList.get(i)); - continue; - } - String fragment = contig.substring(0, kmerIndex); -// System.out.println("extension left + forward: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); - contigList.set(i, fragment + contigList.get(i)); - }else { // right extension - String contig = contigList.get(contigIndex); - int kmerIndex = contig.indexOf(contigList.get(i).substring(contigList.get(i).length()-2*param.kmerSize)); - if (kmerIndex == -1){ - // System.out.println(contig); - // System.out.println(contigList.get(i)); - continue; - } - String fragment = contig.substring(kmerIndex+2*param.kmerSize); - // System.out.println("extension right + forward: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); - contigList.set(i, contigList.get(i) + fragment); - } - } - } - } - - for (int i =0; i< contigList.size(); i++){ - if (!redundantTable.containsKey(i)){ - uniqueContig.add(RowFactory.create(contigList.get(i))); - } - } - - - return uniqueContig.iterator(); - } - - private String reverseComplement(String forward){ - String reverseComplementNucleotides; - - - char[] nucleotides = forward.toCharArray(); - int nucleotideNum = nucleotides.length; - char[] nucleotidesRC = new char[nucleotideNum]; - - for (int i=0; i, Serializable{ + class DSMergeRedundantNonRCContigs implements MapPartitionsFunction, Serializable{ List uniqueContig = new ArrayList(); List contigList = new ArrayList(); Row contig; @@ -1084,10 +488,10 @@ public Iterator call (Iterator sIterator){ for (int i = 0; i< contigList.size(); i++){ String contigAgain = contigList.get(i); - String RCcontigAgain = reverseComplement(contigAgain); + // String RCcontigAgain = reverseComplement(contigAgain); for (int j=0; j< contigAgain.length()-param.kmerSize;j++){ String kmerSearch = contigAgain.substring(j, j+param.kmerSize); - String kmerRCSearch = RCcontigAgain.substring(j, j+param.kmerSize); + // String kmerRCSearch = RCcontigAgain.substring(j, j+param.kmerSize); if (probKmerTable.containsKey(kmerSearch)){ if (contigAgain.length() > contigList.get(probKmerTable.get(kmerSearch)).length()){ redundantTable.put(probKmerTable.get(kmerSearch), true); @@ -1100,7 +504,7 @@ public Iterator call (Iterator sIterator){ extension |= ( contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() -j) ); extension |= ((long) probKmerTable.get(kmerSearch) << 32); overlapTableRight.put(i, extension); - // System.out.println("longer + right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); + // System.out.println("longer + right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); }else{ // longer extension already exist } @@ -1110,7 +514,7 @@ public Iterator call (Iterator sIterator){ extension |= ( contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() -j) ); extension |= ((long) probKmerTable.get(kmerSearch) << 32); overlapTableRight.put(i, extension); - // System.out.println("right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); + // System.out.println("right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); } } @@ -1131,7 +535,7 @@ public Iterator call (Iterator sIterator){ extension |= (contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() - j)); extension |= ((long) probKmerTable.get(kmerSearch) << 32); overlapTableRight.put(i, extension); - // System.out.println("longer + equal + right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); + // System.out.println("longer + equal + right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); } else { // longer extension already exist } @@ -1141,7 +545,7 @@ public Iterator call (Iterator sIterator){ extension |= (contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() - j)); extension |= ((long) probKmerTable.get(kmerSearch) << 32); overlapTableRight.put(i, extension); - // System.out.println("equal + right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); + // System.out.println("equal + right + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); } } } else { @@ -1154,7 +558,7 @@ public Iterator call (Iterator sIterator){ extension |= (contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() - j)); extension |= ((long) i << 32); overlapTable.put(probKmerTable.get(kmerSearch), extension); - // System.out.println("longer + equal + left + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); + // System.out.println("longer + equal + left + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); } else { // longer extension already exist } @@ -1164,7 +568,7 @@ public Iterator call (Iterator sIterator){ extension |= (contigList.get(probKmerTable.get(kmerSearch)).length() - (contigAgain.length() - j)); extension |= ((long) i << 32); overlapTable.put(probKmerTable.get(kmerSearch), extension); - // System.out.println("equal + left + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); + // System.out.println("equal + left + forward: " + contigList.get(probKmerTable.get(kmerSearch)) + " " + contigList.get(i)); } } @@ -1182,7 +586,7 @@ public Iterator call (Iterator sIterator){ extension |= ((long) j); extension |= ((long) i << 32); overlapTable.put(probKmerTable.get(kmerSearch), extension); - // System.out.println("longer + left + forward: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerSearch))); + // System.out.println("longer + left + forward: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerSearch))); } else { } @@ -1192,7 +596,7 @@ public Iterator call (Iterator sIterator){ extension |= ((long) j); extension |= ((long) i << 32); overlapTable.put(probKmerTable.get(kmerSearch), extension); - // System.out.println("left + forward: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerSearch))); + // System.out.println("left + forward: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerSearch))); } } @@ -1200,7 +604,7 @@ public Iterator call (Iterator sIterator){ // not adding, removed } } - +/* if (probKmerTable.containsKey(kmerRCSearch)){ if (contigAgain.length() > contigList.get(probKmerTable.get(kmerRCSearch)).length()){ redundantTable.put(probKmerTable.get(kmerRCSearch), true); @@ -1208,22 +612,22 @@ public Iterator call (Iterator sIterator){ if (contigList.get(probKmerTable.get(kmerRCSearch)).length() - param.kmerSize > j){ if (overlapTable.containsKey(i)){ if ( (int)(overlapTable.get(i) & maxContigLengthBinary) < contigList.get(probKmerTable.get(kmerRCSearch)).length() - param.kmerSize - j ){ - // Long extension = 0L << 30; // left extension - // extension |= (0L << 62); // reverse complement + // Long extension = 0L << 30; // left extension + // extension |= (0L << 62); // reverse complement Long extension = (long) ( contigList.get(probKmerTable.get(kmerRCSearch)).length() - param.kmerSize - j ); extension |= ((long) probKmerTable.get(kmerRCSearch) << 32); overlapTable.put(i, extension); - // System.out.println("longer + right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); + // System.out.println("longer + right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); }else{ // longer extension already exist } }else{ - // Long extension = 0L << 30; // left extension + // Long extension = 0L << 30; // left extension //reverse complement Long extension = (long) ( contigList.get(probKmerTable.get(kmerRCSearch)).length() - param.kmerSize - j ); extension |= ((long) probKmerTable.get(kmerRCSearch) << 32); overlapTable.put(i, extension); - // System.out.println("right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); + // System.out.println("right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); } } @@ -1239,7 +643,7 @@ public Iterator call (Iterator sIterator){ extension = (long) (j); extension |= ((long) probKmerTable.get(kmerRCSearch) << 32); overlapTableRight.put(i, extension); - // System.out.println("longer + equal + right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); + // System.out.println("longer + equal + right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); } else { // longer extension already exist } @@ -1249,7 +653,7 @@ public Iterator call (Iterator sIterator){ extension = (long) (j); extension |= ((long) probKmerTable.get(kmerRCSearch) << 32); overlapTableRight.put(i, extension); - // System.out.println("equal + right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); + // System.out.println("equal + right + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); } } }else{ @@ -1263,7 +667,7 @@ public Iterator call (Iterator sIterator){ Long extension = (long) (j); extension |= ((long) i << 32); overlapTable.put(probKmerTable.get(kmerRCSearch), extension); - // System.out.println("longer + equal + left + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); + // System.out.println("longer + equal + left + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); } else { // longer extension already exist } @@ -1273,7 +677,7 @@ public Iterator call (Iterator sIterator){ Long extension = (long) (j); extension |= ((long) i << 32); overlapTable.put(probKmerTable.get(kmerRCSearch), extension); - // System.out.println("equal + left + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); + // System.out.println("equal + left + reverse: " + contigList.get(probKmerTable.get(kmerRCSearch)) + " " + contigList.get(i)); } } @@ -1285,28 +689,29 @@ public Iterator call (Iterator sIterator){ if ( j >0){ if (overlapTable.containsKey(probKmerTable.get(kmerRCSearch))) { if (j > (int) (overlapTable.get(probKmerTable.get(kmerRCSearch)) & maxContigLengthBinary)) { - // Long extension = 0L << 30; // left extension - // Long extension = 0L << 62; not reverse complement + // Long extension = 0L << 30; // left extension + // Long extension = 0L << 62; not reverse complement Long extension = ((long) j); extension |= ((long) i << 32); overlapTable.put(probKmerTable.get(kmerRCSearch), extension); - // System.out.println("longer + left + reverse: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerRCSearch))); + // System.out.println("longer + left + reverse: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerRCSearch))); } else { } }else{ - // Long extension = 0L << 30; // left extension + // Long extension = 0L << 30; // left extension // Long extension = 0L << 62; not reverse complement Long extension = ((long) j); extension |= ((long) i << 32); overlapTable.put(probKmerTable.get(kmerRCSearch), extension); - // System.out.println("left + reverse: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerRCSearch))); + // System.out.println("left + reverse: " + contigList.get(i) + " " + contigList.get(probKmerTable.get(kmerRCSearch))); } } break; } } + */ } } @@ -1317,30 +722,30 @@ public Iterator call (Iterator sIterator){ int direction = (int) ((extension >>> 30) & 3L); int RC = (int) ((extension >>> 62) & 3L); int contigIndex = (int) ((extension >>> 32) & maxContigLengthBinary); - // int extensionIndex = (int) (extension & maxContigLengthBinary); not use here + // int extensionIndex = (int) (extension & maxContigLengthBinary); not use here if (RC == 0){ // reverse complement if (direction == 0){ // left extension String contig = reverseComplement(contigList.get(contigIndex)); int kmerIndex = contig.indexOf(contigList.get(i).substring(0, 2*param.kmerSize)); if (kmerIndex == -1){ - // System.out.println(contig); - // System.out.println(contigList.get(i)); + // System.out.println(contig); + // System.out.println(contigList.get(i)); continue; } String fragment = contig.substring(0, kmerIndex); - // System.out.println("extension left + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); + // System.out.println("extension left + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); contigList.set(i, fragment + contigList.get(i)); }else { // right extension String contig = reverseComplement(contigList.get(contigIndex)); int kmerIndex = contig.indexOf(contigList.get(i).substring(contigList.get(i).length()-2*param.kmerSize)); if (kmerIndex == -1){ - // System.out.println(contig); - // System.out.println(contigList.get(i)); + // System.out.println(contig); + // System.out.println(contigList.get(i)); continue; } String fragment = contig.substring(kmerIndex+2*param.kmerSize); - // System.out.println("extension right + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); + // System.out.println("extension right + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); contigList.set(i, contigList.get(i) + fragment); } }else { // not reverse complement @@ -1348,12 +753,12 @@ public Iterator call (Iterator sIterator){ String contig = contigList.get(contigIndex); int kmerIndex = contig.indexOf(contigList.get(i).substring(0, 2*param.kmerSize)); if (kmerIndex == -1){ - // System.out.println(contig); - // System.out.println(contigList.get(i)); + // System.out.println(contig); + // System.out.println(contigList.get(i)); continue; } String fragment = contig.substring(0, kmerIndex); - // System.out.println("extension left + forward: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); + // System.out.println("extension left + forward: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); contigList.set(i, fragment + contigList.get(i)); }else { // right extension String contig = contigList.get(contigIndex); @@ -1364,7 +769,7 @@ public Iterator call (Iterator sIterator){ continue; } String fragment = contig.substring(kmerIndex+2*param.kmerSize); - // System.out.println("extension right + forward: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); + // System.out.println("extension right + forward: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); contigList.set(i, contigList.get(i) + fragment); } } @@ -1386,16 +791,16 @@ public Iterator call (Iterator sIterator){ continue; } String fragment = contig.substring(0, kmerIndex); - // System.out.println("extension left + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); + // System.out.println("extension left + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); contigList.set(i, fragment + contigList.get(i)); }else { // right extension String contig = reverseComplement(contigList.get(contigIndex)); int kmerIndex = contigList.get(i).indexOf(contig.substring(0, 2*param.kmerSize)); - // int kmerIndex = contig.indexOf(contigList.get(i).substring(contigList.get(i).length()-param.kmerSize)); + // int kmerIndex = contig.indexOf(contigList.get(i).substring(contigList.get(i).length()-param.kmerSize)); if (kmerIndex == -1){ - // System.out.println(contig); - // System.out.println(contigList.get(i)); + // System.out.println(contig); + // System.out.println(contigList.get(i)); continue; } @@ -1405,7 +810,7 @@ public Iterator call (Iterator sIterator){ } String fragment = contig.substring(kmerIndex); - // System.out.println("extension right + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); + // System.out.println("extension right + reverse: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); contigList.set(i, contigList.get(i) + fragment); } }else { // not reverse complement @@ -1424,12 +829,12 @@ public Iterator call (Iterator sIterator){ String contig = contigList.get(contigIndex); int kmerIndex = contig.indexOf(contigList.get(i).substring(contigList.get(i).length()-2*param.kmerSize)); if (kmerIndex == -1){ - // System.out.println(contig); - // System.out.println(contigList.get(i)); + // System.out.println(contig); + // System.out.println(contigList.get(i)); continue; } String fragment = contig.substring(kmerIndex+2*param.kmerSize); - // System.out.println("extension right + forward: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); + // System.out.println("extension right + forward: " + kmerIndex + " " + contigList.get(contigIndex) + " " + contigList.get(i)); contigList.set(i, contigList.get(i) + fragment); } } @@ -1660,7 +1065,6 @@ public char complementary (char a){ } - class DSContigInputParser implements MapPartitionsFunction, Serializable{ Random r = new Random(); @@ -1693,9 +1097,6 @@ public Iterator call (Iterator sIterator){ } } - - - class DSKmerToContigLength implements MapPartitionsFunction, Serializable{ public Iterator call (Iterator sIterator){ List contigList = new ArrayList(); @@ -1769,112 +1170,6 @@ public String changeLine(String oneLine, int lineLength, int limitedLength){ } } - - class DSKmerToContig implements MapPartitionsFunction, Serializable{ - - public Iterator call (Iterator sIterator){ - List contigList = new ArrayList(); - - while (sIterator.hasNext()) { - Row s = sIterator.next(); - if (s.getInt(1) == 1) { - String contig = s.getString(0) + s.getString(2); - int length = contig.length(); - if (length >= param.minContig) { - String ID = ">Contig-" + length; - String formatedContig = changeLine(contig, length, 100); - contigList.add(RowFactory.create(ID, formatedContig)); - } - } else { // (randomReflexivMarker == 2) { - String contig = s.getString(2) + s.getString(0); - int length = contig.length(); - if (length >= param.minContig) { - String ID = ">Contig-" + length; - String formatedContig = changeLine(contig, length, 100); - contigList.add(RowFactory.create(ID, formatedContig)); - } - } - } - - return contigList.iterator(); - } - - public String changeLine(String oneLine, int lineLength, int limitedLength){ - String blockLine = ""; - int fold = lineLength / limitedLength; - int remainder = lineLength % limitedLength; - if (fold ==0) { - blockLine = oneLine; - }else if (fold == 1 && remainder == 0){ - blockLine = oneLine; - }else if (fold >1 && remainder == 0){ - for (int i =0 ; i, Serializable{ - List reflexivKmerStringList = new ArrayList(); - - public Iterator call(Iterator sIterator){ - while (sIterator.hasNext()){ - String subKmer = ""; - String subString =""; - Row s = sIterator.next(); - int currentSuffixLength = Long.SIZE/2 - (Long.numberOfLeadingZeros(s.getLong(2))/2 + 1); - for (int i=1; i<=param.subKmerSize;i++){ - Long currentNucleotideBinary = s.getLong(0) >>> 2*(param.subKmerSize - i); - currentNucleotideBinary &= 3L; - char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); - subKmer += currentNucleotide; - } - - for (int i=1; i<=currentSuffixLength; i++){ - Long currentNucleotideBinary = s.getLong(2) >>> 2*(currentSuffixLength - i); - currentNucleotideBinary &= 3L; - char currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); - subString += currentNucleotide; - } - - reflexivKmerStringList.add ( - RowFactory.create( - subKmer, s.getInt(1), subString, s.getInt(3), s.getInt(4)) - ); - } - return reflexivKmerStringList.iterator(); - } - - private char BinaryToNucleotide (Long twoBits){ - char nucleotide; - if (twoBits == 0){ - nucleotide = 'A'; - }else if (twoBits == 1){ - nucleotide = 'C'; - }else if (twoBits == 2){ - nucleotide = 'G'; - }else{ - nucleotide = 'T'; - } - return nucleotide; - } - } - /** * */ @@ -1948,8 +1243,6 @@ private char BinaryToNucleotide(Long twoBits) { /** * */ - - class DSExtendReflexivKmerToArrayLoop implements MapPartitionsFunction, Serializable{ /* marker to identify similar SubKmers in the loop sequence */ @@ -2733,9 +2026,6 @@ public void tmpKmerRandomizer(){ /** * */ - - - class DSExtendReflexivKmerToArrayFirstTime implements MapPartitionsFunction, Serializable{ /* marker to identify similar SubKmers in the loop sequence */ @@ -3168,8 +2458,6 @@ public void tmpKmerRandomizer(){ /** * */ - - class DSExtendReflexivKmer implements MapPartitionsFunction, Serializable{ /* marker to identify similar SubKmers in the loop sequence */ @@ -4004,40 +3292,6 @@ public Iterator call(Iterator s){ return kmerList.iterator(); } } - - class DSKmerReverseComplementLong implements MapPartitionsFunction, Serializable{ - /* a capsule for all Kmers and reverseComplementKmers */ - List kmerList = new ArrayList(); - Long reverseComplement; - Row kmerTuple; - Long lastTwoBits; - Long kmerBinary; - - - public Iterator call(Iterator s){ - - - while (s.hasNext()) { - kmerTuple = s.next(); - kmerBinary = kmerTuple.getLong(0); - reverseComplement=0L; - for (int i = 0; i < param.kmerSize; i++) { - reverseComplement<<=2; - - lastTwoBits = kmerBinary & 3L ^ 3L; - kmerBinary >>>=2; - reverseComplement|=lastTwoBits; - } - - kmerList.add(RowFactory.create(kmerTuple.getLong(0), (int)kmerTuple.getLong(1))); - kmerList.add(RowFactory.create(reverseComplement, (int)kmerTuple.getLong(1))); - } - - return kmerList.iterator(); - } - } - - class KmerBinarizer implements MapPartitionsFunction, Serializable { List kmerList = new ArrayList(); @@ -4116,138 +3370,6 @@ private long nucleotideValue(char a) { } - - - - class ReverseComplementKmerBinaryExtractionFromDataset implements MapPartitionsFunction, Serializable{ - long maxKmerBits= ~((~0L) << (2*param.kmerSize)); - - List kmerList = new ArrayList(); - int readLength; - String[] units; - String read; - char nucleotide; - long nucleotideInt; - long nucleotideIntComplement; - - public Iterator call(Iterator s){ - - while (s.hasNext()) { - units = s.next().split("\\n"); - read = units[1]; - readLength = read.length(); - - if (readLength - param.kmerSize - param.endClip <= 1 || param.frontClip > readLength) { - continue; - } - - Long nucleotideBinary = 0L; - Long nucleotideBinaryReverseComplement = 0L; - - for (int i = param.frontClip; i < readLength - param.endClip; i++) { - nucleotide = read.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideBinary <<= 2; - nucleotideBinary |= nucleotideInt; - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinary &= maxKmerBits; - } - - // reverse kmer binarizationalitivities :) non English native speaking people making fun of English - nucleotideIntComplement = nucleotideInt ^ 3; // 3 is binary 11; complement: 11(T) to 00(A), 10(G) to 01(C) - - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinaryReverseComplement >>>= 2; - nucleotideIntComplement <<= 2 * (param.kmerSize - 1); - } else { - nucleotideIntComplement <<= 2 * (i - param.frontClip); - } - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - - // reach the first complete K-mer - if (i - param.frontClip >= param.kmerSize - 1) { - if (nucleotideBinary.compareTo(nucleotideBinaryReverseComplement) < 0) { - kmerList.add(nucleotideBinary); - } else { - kmerList.add(nucleotideBinaryReverseComplement); - } - } - } - } - return kmerList.iterator(); - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - } - - /** - * interface class for RDD implementation, used in step 3 - * ----------- - * ------ - * ------ - * ------ - * ------ - * ------ - * ------ - */ - - - class DSFastqUnitFilter implements FilterFunction, Serializable{ - public boolean call(String s){ - return s != null; - } - } - - /** - * interface class for RDD implementation, Used in step 1 - */ - - - class DSFastqFilterWithQual implements MapFunction, Serializable{ - String line = ""; - int lineMark = 0; - public String call(String s) { - if (lineMark == 2) { - lineMark++; - line = line + "\n" + s; - return null; - } else if (lineMark == 3) { - lineMark++; - line = line + "\n" + s; - return line; - } else if (s.startsWith("@")) { - line = s; - lineMark = 1; - return null; - } else if (lineMark == 1) { - line = line + "\n" + s; - lineMark++; - return null; - }else{ - return null; - } - } - } - - /** - * interface class for RDD implementation, used in step 2 - */ - - /** * * @param param diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSReAssembler64.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSReAssembler64.java index fcf1cc4..b57cd98 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSReAssembler64.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSReAssembler64.java @@ -129,12 +129,7 @@ public void assembly() { kmerCountTupleStruct = kmerCountTupleStruct.add("kmerBlocks", DataTypes.createArrayType(DataTypes.LongType), false); kmerCountTupleStruct = kmerCountTupleStruct.add("count", DataTypes.IntegerType, false); ExpressionEncoder KmerBinaryCountEncoder = RowEncoder.apply(kmerCountTupleStruct); -/* - StructType kmerBinaryStruct = new StructType(); - kmerBinaryStruct = kmerBinaryStruct.add("kmerBlocks", DataTypes.createArrayType(DataTypes.LongType), false); - kmerBinaryStruct = kmerBinaryStruct.add("count", DataTypes.IntegerType, false); - ExpressionEncoder kmerBinaryEncoder = RowEncoder.apply(kmerBinaryStruct); -*/ + Dataset ReflexivSubKmerDS; StructType ReflexivKmerStruct = new StructType(); ReflexivKmerStruct = ReflexivKmerStruct.add("k-1", DataTypes.createArrayType(DataTypes.LongType), false); @@ -532,10 +527,6 @@ public void assemblyFromKmer() { iterations++; - //ReflexivSubKmerStringDS= ReflexivSubKmerDS.mapPartitions(StringOutputDS, ReflexivKmerStringEncoder); - // ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); - //ReflexivSubKmerStringDS.write().format("csv").save(param.outputPath + iterations); - /** * Extract Long sub kmer */ @@ -590,19 +581,8 @@ public void assemblyFromKmer() { ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.sort("k-1"); -// ReflexivLongSubKmerDS.cache(); -// ReflexivLongSubKmerStringDS = ReflexivLongSubKmerDS.mapPartitions(DSArrayStringOutput, ReflexivLongKmerStringEncoder); -// ReflexivLongSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); -// ReflexivSubKmerStringDS= ReflexivLongSubKmerDS.mapPartitions(StringOutputDS, reflexivKmerStringEncoder); -// ReflexivSubKmerStringDS.toJavaRDD().saveAsTextFile(param.outputPath + iterations); -// ReflexivSubKmerStringRDD = ReflexivLongSubKmerRDD.mapPartitionsToPair(ArrayStringOutput); -// ReflexivSubKmerStringRDD.saveAsTextFile(param.outputPath + iterations); - ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.mapPartitions(DSKmerExtenstionArrayToArray, ReflexivLongKmerEncoder); -// ReflexivSubKmerStringRDD = ReflexivLongSubKmerRDD.mapPartitionsToPair(ArrayStringOutput); -// ReflexivSubKmerStringRDD.saveAsTextFile(param.outputPath + iterations + "Extend"); - } ReflexivLongSubKmerDS = ReflexivLongSubKmerDS.sort("k-1"); @@ -614,48 +594,9 @@ public void assemblyFromKmer() { ReflexivLongSubKmerStringDS = ReflexivLongSubKmerDS.mapPartitions(DSArrayStringOutput, ReflexivLongKmerStringEncoder); - /** - * - */ - // DSKmerToContigLength contigLengthDS = new DSKmerToContigLength(); - // ContigLengthRows = ReflexivLongSubKmerStringDS.mapPartitions(contigLengthDS, ContigLengthEncoder); - - - // DSFormatContigs ContigFormater = new DSFormatContigs(); - // ContigRows= ContigMergedRow.mapPartitions(ContigFormater, ContigStringEncoder); DSKmerToContig contigformaterDS = new DSKmerToContig(); ContigRows = ReflexivLongSubKmerStringDS.mapPartitions(contigformaterDS, ContigStringEncoder); -/* DSKmerToContigString contigStringerDS = new DSKmerToContigString(); - NewContigDS = ReflexivLongSubKmerStringDS.mapPartitions(contigStringerDS, Encoders.STRING()); - - DSContigInputParser contigParser = new DSContigInputParser(); - - ContigDS= ContigDS.union(NewContigDS); - - ContigDS.cache(); - - ContigLengthRows = ContigDS.mapPartitions(contigParser, ContigLengthEncoder); - ContigLengthRows = ContigLengthRows.sort("length"); - - ContigLengthRows = ContigLengthRows.coalesce(1); - - DSMergeRedundantContigs RedundantMerger = new DSMergeRedundantContigs(); - DSMergeRedundantNonRCContigs RedundantNonRCMerger = new DSMergeRedundantNonRCContigs(); - if (param.RCmerge){ - ContigMergedRow = ContigLengthRows.mapPartitions(RedundantMerger, ContigMergedEncoder); - } else { - ContigMergedRow = ContigLengthRows.mapPartitions(RedundantNonRCMerger, ContigMergedEncoder); - } - - ContigMergedRow = ContigMergedRow.repartition(param.partitions); - - DSFormatContigs ContigFormater = new DSFormatContigs(); - - ContigRows = ContigMergedRow.mapPartitions(ContigFormater, ContigStringEncoder); -*/ - // DSKmerToContig contigformaterDS = new DSKmerToContig(); - // ContigRows = ReflexivLongSubKmerStringDS.mapPartitions(contigformaterDS, ContigStringEncoder); /** * diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSStitchingLonger.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSStitchingLonger.java index 4f5ff54..f8b3b95 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSStitchingLonger.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDSStitchingLonger.java @@ -294,10 +294,7 @@ private long[] leftShiftArray(long[] blocks, int shiftingLength) throws Exceptio // if (relativeShiftSize ==0) then only shifting blocks int j=0; // new index for shifted blocks - // long oldShiftOut=0L; // if only one block, then 0 bits -// if (blocks.length-(startingBlockIndex+1) >=1) { // more than one block, newBlock.length = blocks.length-startingBlockIndex -// oldShiftOut = blocks[startingBlockIndex + 1] >>> 2 * (32 - relativeShiftSize); - // } + for (int i=startingBlockIndex; i>> 2*(31-relativeShiftSize); // ooooxxxxxxx -> -------oooo o=shift out x=needs to be left shifted newBlock[j]= blocks[i] << 2*relativeShiftSize; // 00000xxxxx -> xxxxx----- diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameCounter.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameCounter.java index d396f12..5645e1c 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameCounter.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameCounter.java @@ -160,8 +160,7 @@ public void assembly() throws IOException { if (param.inputFormat.equals("4mc")){ Configuration baseConfiguration = new Configuration(); - // baseConfiguration.setInt("mapred.min.split.size", 6000000); - // baseConfiguration.setInt("mapred.max.split.size", 6000000); + Job jobConf = Job.getInstance(baseConfiguration); // sc.hadoopConfiguration().setInt("mapred.max.split.size", 6000000); JavaPairRDD FastqPairRDD = sc.newAPIHadoopFile(param.inputFqPath, FourMcTextInputFormat.class, LongWritable.class, Text.class, jobConf.getConfiguration()); @@ -186,15 +185,6 @@ public void assembly() throws IOException { DSFastqFilterOnlySeq DSFastqFilterToSeq = new DSFastqFilterOnlySeq(); // for reflexiv FastqDS = FastqDS.mapPartitions(DSFastqFilterToSeq, Encoders.STRING()); } - - /* - DSFastqFilterWithQual DSFastqFilter = new DSFastqFilterWithQual(); - FastqDS = FastqDS.map(DSFastqFilter, Encoders.STRING()); - - DSFastqUnitFilter FilterDSUnit = new DSFastqUnitFilter(); - - FastqDS = FastqDS.filter(FilterDSUnit); - */ } @@ -452,41 +442,6 @@ private char BinaryToNucleotide (Long twoBits){ } } - /** - * - */ - class DSFastqUnitFilter implements FilterFunction, Serializable{ - public boolean call(String s){ - return s != null; - } - } - - /** - * - */ - class DSFastqFilterWithQual implements MapFunction, Serializable{ - String line = ""; - int lineMark = 0; - public String call(String s) { - if (lineMark == 2) { - lineMark++; - return null; - } else if (lineMark == 3) { - lineMark++; - return line; - } else if (s.startsWith("@")) { - lineMark = 1; - return null; - } else if (lineMark == 1) { - line = s; - lineMark++; - return null; - }else{ - return null; - } - } - } - /** * */ diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameCounter64.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameCounter64.java index 355ad11..21fe07c 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameCounter64.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameCounter64.java @@ -186,14 +186,6 @@ public void assembly() throws IOException { FastqDS = FastqDS.mapPartitions(DSFastqFilterToSeq, Encoders.STRING()); } - /* - DSFastqFilterWithQual DSFastqFilter = new DSFastqFilterWithQual(); - FastqDS = FastqDS.map(DSFastqFilter, Encoders.STRING()); - - DSFastqUnitFilter FilterDSUnit = new DSFastqUnitFilter(); - - FastqDS = FastqDS.filter(FilterDSUnit); -*/ } if (param.cache) { @@ -208,15 +200,6 @@ public void assembly() throws IOException { ReverseComplementKmerBinaryExtractionFromDataset64 DSExtractRCKmerBinaryFromFastq = new ReverseComplementKmerBinaryExtractionFromDataset64(); KmerBinaryDS = FastqDS.mapPartitions(DSExtractRCKmerBinaryFromFastq, kmerBinaryEncoder); - // PrintElement ElementPrinter = new PrintElement(); - // KmerBinaryDS = KmerBinaryDS.mapPartitions(ElementPrinter, kmerBinaryEncoder); - - - // KmerBinaryDS = KmerBinaryDS.sort("kmerBlocks"); - // KmerBinaryDS.cache(); - - // KmerBinaryDS = KmerBinaryDS.mapPartitions(ElementPrinter, kmerBinaryEncoder); - DFKmerBinaryCount = KmerBinaryDS.groupBy("kmerBlocks") .count() .toDF("kmerBlocks","count"); @@ -259,8 +242,6 @@ public void assembly() throws IOException { class DSFastqFilterOnlySeq implements MapPartitionsFunction, Serializable{ ArrayList seqArray = new ArrayList(); - //String line; - //int lineMark = 0; public Iterator call(Iterator sIterator) { while (sIterator.hasNext()) { @@ -309,48 +290,6 @@ private boolean checkSeq(char a){ return false; } } - - /* - public Iterator call(Iterator sIterator) { - while (sIterator.hasNext()) { - String s = sIterator.next(); - if (lineMark == 2) { - lineMark++; - } else if (lineMark == 3) { - lineMark++; - seqArray.add(line); - } else if (s.startsWith("@")) { - lineMark = 1; - } else if (lineMark == 1) { - line = s; - lineMark++; - } - } - - return seqArray.iterator(); - } - */ - -/* - public String call(String s) { - if (lineMark == 2) { - lineMark++; - return null; - } else if (lineMark == 3) { - lineMark++; - return line; - } else if (s.startsWith("@")) { - lineMark = 1; - return null; - } else if (lineMark == 1) { - line = s; - lineMark++; - return null; - }else{ - return null; - } - } - */ } @@ -363,27 +302,7 @@ public Iterator call(Iterator> sIterator) thr Tuple2 s = sIterator.next(); seq = s._2().toString(); -/* - if (seq.length()<= 20) { - continue; - } else if (seq.startsWith("@")) { - continue; - } else if (seq.startsWith("+")) { - continue; - } else if (!checkSeq(seq.charAt(0))) { - continue; - } else if (!checkSeq(seq.charAt(4))){ - continue; - } else if (!checkSeq(seq.charAt(9))){ - continue; - } else if (!checkSeq(seq.charAt(14))){ - continue; - } else if (!checkSeq(seq.charAt(19))){ - continue; - } else { - reflexivKmerStringList.add(seq); - } -*/ + reflexivKmerStringList.add(seq); } return reflexivKmerStringList.iterator(); @@ -467,59 +386,6 @@ private char BinaryToNucleotide (Long twoBits){ } } - /** - * - */ - class DSFastqUnitFilter implements FilterFunction, Serializable{ - public boolean call(String s){ - return s != null; - } - } - - /** - * - */ - class DSFastqFilterWithQual implements MapFunction, Serializable{ - String line = ""; - int lineMark = 0; - public String call(String s) { - if (lineMark == 2) { - lineMark++; - return null; - } else if (lineMark == 3) { - lineMark++; - return line; - } else if (s.startsWith("@")) { - lineMark = 1; - return null; - } else if (lineMark == 1) { - line = s; - lineMark++; - return null; - }else{ - return null; - } - } - } - - class PrintElement implements MapPartitionsFunction, Serializable{ - List theSameList = new ArrayList(); - - public Iterator call(Iterator s){ - while (s.hasNext()){ - Row sIterator = s.next(); - // Long a = (Long)sIterator.getSeq(0).apply(0); - // Long b = (Long)sIterator.getSeq(0).apply(1); - - // System.out.println(sIterator.getList(0).get(0)); - // System.out.println(sIterator.getInt(1)); - - theSameList.add(sIterator); - } - return theSameList.iterator(); - } - } - /** * */ @@ -862,87 +728,6 @@ private Long[] shiftLongArrayBinary (Long[] previousKmer){ } } - - /** - * - */ - /* - class ReverseComplementKmerBinaryExtractionFromDataset implements MapPartitionsFunction, Serializable{ - long maxKmerBits= ~((~0L) << (2*param.kmerSize)); - - List kmerList = new ArrayList(); - int readLength; - String[] units; - String read; - char nucleotide; - long nucleotideInt; - long nucleotideIntComplement; - - public Iterator call(Iterator s){ - - while (s.hasNext()) { - units = s.next().split("\\n"); - read = units[1]; - readLength = read.length(); - - if (readLength - param.kmerSize - param.endClip <= 1 || param.frontClip > readLength) { - continue; - } - - Long nucleotideBinary = 0L; - Long nucleotideBinaryReverseComplement = 0L; - - for (int i = param.frontClip; i < readLength - param.endClip; i++) { - nucleotide = read.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideBinary <<= 2; - nucleotideBinary |= nucleotideInt; - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinary &= maxKmerBits; - } - - // reverse kmer binarizationalitivities :) non English native speaking people making fun of English - nucleotideIntComplement = nucleotideInt ^ 3; // 3 is binary 11; complement: 11(T) to 00(A), 10(G) to 01(C) - - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinaryReverseComplement >>>= 2; - nucleotideIntComplement <<= 2 * (param.kmerSize - 1); - } else { - nucleotideIntComplement <<= 2 * (i - param.frontClip); - } - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - - // reach the first complete K-mer - if (i - param.frontClip >= param.kmerSize - 1) { - if (nucleotideBinary.compareTo(nucleotideBinaryReverseComplement) < 0) { - kmerList.add(nucleotideBinary); - } else { - kmerList.add(nucleotideBinaryReverseComplement); - } - } - } - } - return kmerList.iterator(); - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - } - */ - /** * * @param param diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameDecompresser.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameDecompresser.java index fe13473..578089e 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameDecompresser.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameDecompresser.java @@ -237,10 +237,7 @@ public void assembly() throws IOException { } if (param.interleavedSwitch) { - // FastqRDD =FastqDSLine.toJavaRDD(); - // FastqRDD.saveAsTextFile(param.outputPath + "/Read_Interleaved", FourMcCodec.class); - // FastqDS = spark.createDataset(FastqRDD.rdd(), Encoders.STRING()); DSFastqFilterWithQual DSFastqFilterToFastq = new DSFastqFilterWithQual(); FastqDS = FastqDS.map(DSFastqFilterToFastq, Encoders.STRING()); @@ -254,14 +251,7 @@ public void assembly() throws IOException { DSJavaPipe myPipe = new DSJavaPipe(); MergedSeq=FastqDS.toJavaRDD().mapPartitions(myPipe); - /* - if (param.mode.equals("local")) { - param.executable = param.executable + "/flash"; - MergedSeq = FastqDS.toJavaRDD().pipe(param.executable + " -t 1 --tab-delimited-input --tab-delimited-output --allow-outies --max-overlap 85 -c /dev/stdin");// |awk '{if ($4){print $2\"\\n\"$4}else{print $2}}'"); - }else{ - MergedSeq = FastqDS.toJavaRDD().pipe("./flash -t 1 --tab-delimited-input --tab-delimited-output --allow-outies --max-overlap 85 -c /dev/stdin"); - } -*/ + DSFlashOutputToSeq FlashMergedTabToSeq = new DSFlashOutputToSeq(); MergedSeq= MergedSeq.mapPartitions(FlashMergedTabToSeq); @@ -273,8 +263,6 @@ public void assembly() throws IOException { } if (param.inputPairedSwitch){ - // FastqRDD =FastqDSLine.toJavaRDD(); - // FastqRDD.saveAsTextFile(param.outputPath + "/Read_Paired", FourMcCodec.class); StructType ReadStringStruct = new StructType(); ReadStringStruct = ReadStringStruct.add("ID", DataTypes.StringType, false); @@ -304,24 +292,13 @@ public void assembly() throws IOException { FastqDS = FastqDS.mapPartitions(FastqToTab, Encoders.STRING()); JavaRDD MergedSeq; - /* - if (param.mode.equals("local")) { - param.executable = param.executable + "/flash"; - MergedSeq = FastqDS.toJavaRDD().pipe(param.executable + " -t 1 --tab-delimited-input --tab-delimited-output --allow-outies --max-overlap 85 -c /dev/stdin");// |awk '{if ($4){print $2\"\\n\"$4}else{print $2}}'"); - }else{ - MergedSeq = FastqDS.toJavaRDD().pipe("./flash -t 1 --tab-delimited-input --tab-delimited-output --allow-outies --max-overlap 85 -c /dev/stdin"); - } -*/ + DSJavaPipe myPipe = new DSJavaPipe(); MergedSeq=FastqDS.toJavaRDD().mapPartitions(myPipe); DSFlashOutputToSeq FlashMergedTabToSeq = new DSFlashOutputToSeq(); MergedSeq= MergedSeq.mapPartitions(FlashMergedTabToSeq); -/* - if (param.partitions > 0) { - MergedSeq = MergedSeq.repartition(param.partitions); - } -*/ + MergedSeq.saveAsTextFile(param.outputPath + "/Read_Paired_Merged", FourMcCodec.class); } @@ -432,201 +409,6 @@ public String call(String s) { } } - /** - * - */ - class DSFastqFilterOnlySeq implements MapPartitionsFunction, Serializable{ - ArrayList seqArray = new ArrayList(); - //String line; - //int lineMark = 0; - - public Iterator call(Iterator sIterator) { - while (sIterator.hasNext()) { - String s = sIterator.next(); - if (s.length()<= 20) { - continue; - } else if (s.startsWith("@")) { - continue; - } else if (s.startsWith("+")) { - continue; - } else if (!checkSeq(s.charAt(0))) { - continue; - } else if (!checkSeq(s.charAt(4))){ - continue; - } else if (!checkSeq(s.charAt(9))){ - continue; - } else if (!checkSeq(s.charAt(14))){ - continue; - } else if (!checkSeq(s.charAt(19))){ - continue; - } else { - seqArray.add(s); - } - } - - return seqArray.iterator(); - } - - private boolean checkSeq(char a){ - int match =0; - if (a=='A'){ - match++; - }else if (a=='T'){ - match++; - }else if (a=='C'){ - match++; - }else if (a=='G'){ - match++; - }else if (a=='N'){ - match++; - } - - if (match >0){ - return true; - }else{ - return false; - } - } - - /* - public Iterator call(Iterator sIterator) { - while (sIterator.hasNext()) { - String s = sIterator.next(); - if (lineMark == 2) { - lineMark++; - } else if (lineMark == 3) { - lineMark++; - seqArray.add(line); - } else if (s.startsWith("@")) { - lineMark = 1; - } else if (lineMark == 1) { - line = s; - lineMark++; - } - } - - return seqArray.iterator(); - } - */ - -/* - public String call(String s) { - if (lineMark == 2) { - lineMark++; - return null; - } else if (lineMark == 3) { - lineMark++; - return line; - } else if (s.startsWith("@")) { - lineMark = 1; - return null; - } else if (lineMark == 1) { - line = s; - lineMark++; - return null; - }else{ - return null; - } - } - */ - } - - class FirstNFastq implements MapPartitionsFunction, Serializable{ - List fastLines = new ArrayList(); - String line; - int readcount=0; - int lineMark=0; - - public Iterator call(Iterator s) { - while (s.hasNext() && readcount <= param.readLimit) { - line = s.next(); - fastLines.add(line); - readcount++; - } - //System.out.println("how many times did you went through me"); - return fastLines.iterator(); - } - } - - - /** - * - */ - class ReadBinarizer implements MapPartitionsFunction, Serializable{ - List kmerList = new ArrayList(); - String units; - String kmer; - int currentKmerSize; - int currentKmerBlockSize; - int currentSubKmerSize; - int currentSubKmerBlockSize; - char nucleotide; - long nucleotideInt; - // Long suffixBinary; - // Long[] suffixBinaryArray; - - public Iterator call(Iterator s) { - - while (s.hasNext()) { - - units = s.next(); - - kmer = units.split("\\n")[1]; - - if (kmer.startsWith("(")) { - kmer = kmer.substring(1); - } - - currentKmerSize= kmer.length(); - currentSubKmerSize = currentKmerSize-1; - currentKmerBlockSize = (currentKmerSize-1)/31+1; // each 31 mer is a block - currentSubKmerBlockSize = (currentSubKmerSize-1)/31+1; - - long[] nucleotideBinarySlot = new long[currentKmerBlockSize]; - // Long nucleotideBinary = 0L; - - for (int i = 0; i < currentSubKmerSize; i++) { - nucleotide = kmer.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideInt <<= 2*(32-1-(i%31)); // shift to the left [ATCGGATCC-,ATCGGATCC-] -// nucleotideBinarySlot[i / 31] <<= 2*((32-i)%32); - nucleotideBinarySlot[i / 31] |= nucleotideInt; - - // nucleotideBinary <<= 2; - // nucleotideBinary |= nucleotideInt; - } - - // marking the end of the kmer - long kmerEndMark = 1L; - kmerEndMark <<= 2*(32-1-((currentKmerSize-1)%31+1)); - nucleotideBinarySlot[currentKmerBlockSize-1] |= kmerEndMark; // param.kmerListHash.get(currentKmerSize)] == currentKmerBlockSize - - kmerList.add( - RowFactory.create(nucleotideBinarySlot) - ); - } - - return kmerList.iterator(); - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - - } - class DSInputFastqToTab implements MapPartitionsFunction, Serializable { List reflexivKmerStringList = new ArrayList(); String[] seq; @@ -892,78 +674,6 @@ public Iterator call(Iterator> sIterator) thr } } - class DSBinaryReadToString implements MapPartitionsFunction, Serializable{ - List reflexivKmerStringList = new ArrayList(); - long[] subKmerArray; - - public Iterator call(Iterator sIterator) throws Exception { - while (sIterator.hasNext()) { - String subKmer = ""; - - Row s = sIterator.next(); - - subKmerArray= seq2array(s.getSeq(0)); - - subKmer = BinaryBlocksToString(subKmerArray); - - reflexivKmerStringList.add( - subKmer - ); - } - return reflexivKmerStringList.iterator(); - } - - private char BinaryToNucleotide(Long twoBits) { - char nucleotide; - if (twoBits == 0) { - nucleotide = 'A'; - } else if (twoBits == 1) { - nucleotide = 'C'; - } else if (twoBits == 2) { - nucleotide = 'G'; - } else { - nucleotide = 'T'; - } - return nucleotide; - } - - private String BinaryBlocksToString (long[] binaryBlocks){ - // String KmerString=""; - int KmerLength = currentKmerSizeFromBinaryBlockArray(binaryBlocks); - StringBuilder sb= new StringBuilder(); - char currentNucleotide; - - for (int i=0; i< KmerLength; i++){ - Long currentNucleotideBinary = binaryBlocks[i/31] >>> 2 * (32 - (i%31+1)); - currentNucleotideBinary &= 3L; - currentNucleotide = BinaryToNucleotide(currentNucleotideBinary); - sb.append(currentNucleotide); - } - - return sb.toString(); - } - - private int currentKmerSizeFromBinaryBlockArray(long[] binaryBlocks){ - int kmerSize; - int blockSize = binaryBlocks.length; - kmerSize= (blockSize-1) *31; - final int suffix0s = Long.numberOfTrailingZeros(binaryBlocks[blockSize - 1]); // ATCG...01--- - int lastMers = Long.SIZE/2-suffix0s/2-1; - - kmerSize+=lastMers; - return kmerSize; - - } - - private long[] seq2array(Seq a){ - long[] array =new long[a.length()]; - for (int i = 0; i < a.length(); i++) { - array[i] = (Long) a.apply(i); - } - return array; - } - - } /** * * @param param diff --git a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameReAssembleCounter64.java b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameReAssembleCounter64.java index dad62dc..e7f4cdf 100644 --- a/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameReAssembleCounter64.java +++ b/src/main/java/uni/bielefeld/cmg/reflexiv/pipeline/ReflexivDataFrameReAssembleCounter64.java @@ -239,14 +239,6 @@ private char BinaryToNucleotide (Long twoBits){ } } - /** - * - */ - class DSNullFilter implements FilterFunction, Serializable{ - public boolean call(Row s){ - return s != null; - } - } /** * @@ -286,24 +278,6 @@ public String call(String s) { } } - class PrintElement implements MapPartitionsFunction, Serializable{ - List theSameList = new ArrayList(); - - public Iterator call(Iterator s){ - while (s.hasNext()){ - Row sIterator = s.next(); - // Long a = (Long)sIterator.getSeq(0).apply(0); - // Long b = (Long)sIterator.getSeq(0).apply(1); - - // System.out.println(sIterator.getList(0).get(0)); - // System.out.println(sIterator.getInt(1)); - - theSameList.add(sIterator); - } - return theSameList.iterator(); - } - } - class LoadContigFromText implements MapPartitionsFunction, Serializable{ List contigList = new ArrayList(); @@ -1036,85 +1010,6 @@ private Long[] shiftLongArrayBinary (Long[] previousKmer){ } - /** - * - */ - /* - class ReverseComplementKmerBinaryExtractionFromDataset implements MapPartitionsFunction, Serializable{ - long maxKmerBits= ~((~0L) << (2*param.kmerSize)); - - List kmerList = new ArrayList(); - int readLength; - String[] units; - String read; - char nucleotide; - long nucleotideInt; - long nucleotideIntComplement; - - public Iterator call(Iterator s){ - - while (s.hasNext()) { - units = s.next().split("\\n"); - read = units[1]; - readLength = read.length(); - - if (readLength - param.kmerSize - param.endClip <= 1 || param.frontClip > readLength) { - continue; - } - - Long nucleotideBinary = 0L; - Long nucleotideBinaryReverseComplement = 0L; - - for (int i = param.frontClip; i < readLength - param.endClip; i++) { - nucleotide = read.charAt(i); - if (nucleotide >= 256) nucleotide = 255; - nucleotideInt = nucleotideValue(nucleotide); - // forward kmer in bits - nucleotideBinary <<= 2; - nucleotideBinary |= nucleotideInt; - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinary &= maxKmerBits; - } - - // reverse kmer binarizationalitivities :) non English native speaking people making fun of English - nucleotideIntComplement = nucleotideInt ^ 3; // 3 is binary 11; complement: 11(T) to 00(A), 10(G) to 01(C) - - if (i - param.frontClip >= param.kmerSize) { - nucleotideBinaryReverseComplement >>>= 2; - nucleotideIntComplement <<= 2 * (param.kmerSize - 1); - } else { - nucleotideIntComplement <<= 2 * (i - param.frontClip); - } - nucleotideBinaryReverseComplement |= nucleotideIntComplement; - - // reach the first complete K-mer - if (i - param.frontClip >= param.kmerSize - 1) { - if (nucleotideBinary.compareTo(nucleotideBinaryReverseComplement) < 0) { - kmerList.add(nucleotideBinary); - } else { - kmerList.add(nucleotideBinaryReverseComplement); - } - } - } - } - return kmerList.iterator(); - } - - private long nucleotideValue(char a) { - long value; - if (a == 'A') { - value = 0L; - } else if (a == 'C') { - value = 1L; - } else if (a == 'G') { - value = 2L; - } else { // T - value = 3L; - } - return value; - } - } - */ /** *