diff --git a/Makefile b/Makefile index db5201d..dbab604 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ LDFLAGS := -pthread #gfalibs GFALIBS_DIR := $(CURDIR)/gfalibs -SOURCES := main input graph-builder kreeq subgraph kreeq-output +SOURCES := main input graph-builder kreeq subgraph kreeq-output variants OBJECTS := $(addprefix $(BINDIR)/, $(SOURCES)) head: $(OBJECTS) gfalibs | $(BUILD) diff --git a/include/kreeq.h b/include/kreeq.h index c64e66a..05bed5c 100644 --- a/include/kreeq.h +++ b/include/kreeq.h @@ -209,7 +209,7 @@ class DBG : public Kmap { // void printVCF(); - bool searchGraph(std::array mapRange); + void searchGraph(); std::pair findDBGkmer(uint8_t *origin); @@ -229,6 +229,8 @@ class DBG : public Kmap { // bool DBGtoVariants(InSegment *inSegment); + std::pair> searchVariants(std::pair source, std::array mapRange, phmap::parallel_flat_hash_map &targetsMap, ParallelMap32* localGraphCache); + bool variantsToGFA(InSegment *inSegment, Log &threadLog); void collapseNodes(); diff --git a/include/variants.h b/include/variants.h new file mode 100644 index 0000000..347447f --- /dev/null +++ b/include/variants.h @@ -0,0 +1,5 @@ +#ifndef variants_h +#define variants_h + + +#endif /* variants_h */ diff --git a/src/input.cpp b/src/input.cpp index 80a0ef6..c9fd311 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -167,6 +167,14 @@ void Input::read() { knav.loadGenome(&genome); } knav.subgraph(); + + lg.verbose("Searching graph"); + knav.searchGraph(); + lg.verbose("Remove missing edges"); + knav.removeMissingEdges(); + lg.verbose("Generating GFA"); + knav.DBGgraphToGFA(); + knav.report(); // output knav.cleanup(); // delete tmp files break; diff --git a/src/kreeq.cpp b/src/kreeq.cpp index cebd5ad..2a36f8e 100644 --- a/src/kreeq.cpp +++ b/src/kreeq.cpp @@ -228,113 +228,6 @@ bool DBG::evaluateSegment(uint32_t s, std::array mapRange) { } -void DBG::correctSequences() { - - if (userInput.inSequence.empty()) - return; - - lg.verbose("Generating candidates for correction"); - - std::array mapRange = {0,0}; - - // this section will need to be implemented to manage memory - -// if (computeMapRange(mapRange)[1] < mapCount) { -// -// for (uint8_t i = 0; i < userInput.depth; ++i) { -// -// mapRange = {0,0}; -// -// while (mapRange[1] < mapCount) { -// -// mapRange = computeMapRange(mapRange); -// loadMapRange(mapRange); -// // searchGraph(mapRange); -// deleteMapRange(mapRange); -// -// } -// } -// } - - std::vector> jobs; - mapRange = computeMapRange(mapRange); - loadMapRange(mapRange); - - if (!userInput.inBedInclude.empty() || userInput.pipeType == 'i') { - - StreamObj streamObj; - std::shared_ptr stream = streamObj.openStream(userInput, 'i'); - std::string line, bedHeader; - - while (getline(*stream, line)) { - - uint64_t begin = 0, end = 0; - std::istringstream iss(line); - iss >> bedHeader >> begin >> end; - userInput.bedIncludeList.pushCoordinates(bedHeader, begin, end); - } - lg.verbose("Finished reading BED include list"); - userInput.bedIncludeList = BEDPathsToSegments(); - } - - std::vector inSegments = *genome->getInSegments(); - for (InSegment *inSegment : inSegments) - jobs.push_back([this, inSegment] { return DBGtoVariants(inSegment); }); - - threadPool.queueJobs(jobs); - jobWait(threadPool); - deleteMapRange(mapRange); - -} - -bool DBG::searchGraph(std::array mapRange) { // stub - - ParallelMap* genomeDBG = new ParallelMap; - - std::vector *inSegments = genome->getInSegments(); - - for (InSegment *inSegment : *inSegments) { - - uint64_t len = inSegment->getSegmentLen(); - - if (lengetInSequencePtr()->c_str(); - uint8_t* str = new uint8_t[len]; - - for (uint64_t i = 0; i= mapRange[0] && i < mapRange[1]) { - - map = maps[i]; - auto got = map->find(key); - - // check for DBG consistency - if (got == map->end()) { - genomeDBG->insert(*got); -// DBGkmer &dbgkmer = got->second; - } - } - } - } - delete genomeDBG; - return true; - -} - std::pair DBG::findDBGkmer(uint8_t *origin) { uint64_t key, i; @@ -350,32 +243,6 @@ std::pair DBG::findDBGkmer(uint8_t *origin) { } -int8_t DBG::scorePath(std::string path) { // not tested - - int8_t score = 0; - uint8_t len = path.size(); - unsigned char* first = (unsigned char*)path.c_str(); - - for (uint8_t i = 0; i < len-k; ++i) { - -// std::cout< DBG::findPaths(uint8_t *origin, uint8_t *target, uint8_t depth, DBGpath currentPath, Log &threadLog) { - - uint8_t breadth = 0; - std::deque DBGpaths; - - for (uint8_t a = 0; a <= breadth; ++a) { // to expand the search before and after the current pos - - origin -= a; - - if (depth != 0) { - - auto it = findDBGkmer(origin); - DBGkmer *dbgOrigin = it.first; - bool isFw = it.second; - - if (dbgOrigin == nullptr) - return DBGpaths; - - for (uint8_t i = 0; i < 4 ; ++i){ - - DBGpath newPath = currentPath; - - uint8_t e = 0; - if(isFw) - e = i; - else - e = 3-i; - - if ((isFw && dbgOrigin->fw[e] != 0) || (!isFw && dbgOrigin->bw[e] != 0)) { - -// threadLog.add(std::to_string(i) +":"+ std::to_string(depth) + ":" + std::to_string(*(target+1))); - - if(depth == 3 && i == *(target+1)) { - newPath.score += checkNext(origin, target+1); - threadLog.add("found INS (score: " + std::to_string(newPath.score) + ")\t" + newPath.sequence); - DBGpaths.push_back(DBGpath(INS, newPath.pos, newPath.sequence.substr(0, newPath.sequence.size()-1), newPath.score)); - newPath = currentPath; - } - - uint8_t nextKmer[k]; - memcpy(nextKmer, origin+1, k-1); - nextKmer[k-1] = i; - - if(depth == 2 && i == *target) { - newPath.score += checkNext(nextKmer, target+1); - threadLog.add("found DEL (score: " + std::to_string(newPath.score) + ")\t" + newPath.sequence); - newPath.type = DEL; - DBGpaths.push_back(newPath); - newPath = currentPath; - } - - if (depth == 2 && i == *(target+1)) { - newPath.score += checkNext(nextKmer, target+2); - threadLog.add("found SNV (score: " + std::to_string(newPath.score) + ")\t" + newPath.sequence); - DBGpaths.push_back(DBGpath(SNV, newPath.pos, newPath.sequence.substr(0, newPath.sequence.size()), newPath.score)); - newPath = currentPath; - } - - newPath.sequence+=itoc[i]; - -// if (DBGpaths.size() > 2) // limit the number of paths to avoid extensive search -// return DBGpaths; - - std::deque newDBGpaths = findPaths(nextKmer, target, depth-1, newPath, threadLog); - DBGpaths.insert(DBGpaths.end(), newDBGpaths.begin(), newDBGpaths.end()); - } - } - } - } -// if (DBGpaths.size() > 2) // limit the number of paths to avoid extensive search -// DBGpaths.clear(); - return DBGpaths; - -} - -void DBG::printAltPaths(std::vector> altPaths, Log &threadLog) { - - uint8_t pathCounter = 1; - - for (std::vector altPath : altPaths) { - - std::string logMsg = "P" + std::to_string(pathCounter++) + ":"; - - for (uint8_t base : altPath) - logMsg += itoc[base]; - threadLog.add(logMsg += "\n"); - } -} - bool DBG::loadSegmentCoordinates(InSegment *inSegment, std::vector &segmentCoordinates) { auto coordinates = userInput.bedIncludeList.getCoordinates(); @@ -554,315 +332,7 @@ BedCoordinates DBG::BEDPathsToSegments() { // project path coordinates to segmen return inBedIncludeSegments; } -bool DBG::detectAnomalies(InSegment *inSegment, std::vector &anomalies) { - - Log threadLog; - threadLog.setId(inSegment->getuId()); - - std::string sHeader = inSegment->getSeqHeader(); - ParallelMap *map; - uint64_t key, i; - bool isFw = false, anomaly = false; - - uint64_t len = inSegment->getSegmentLen(); - - if (lengetInSequencePtr()->c_str(); - uint8_t* str = new uint8_t[len]; - - for (uint64_t i = 0; ifind(key); - - // check for DBG consistency - if (got != map->end()) { - DBGkmer &dbgkmer = got->second; - if (c < kcount-1 && ((isFw && dbgkmer.fw[*(str+c+k)] == 0) || (!isFw && dbgkmer.bw[3-*(str+c+k)] == 0))) // find alternative paths - anomaly = true; - }else{anomaly = true;} - if (anomaly) { - anomalies.push_back(c+k); - threadLog.add("Anomaly at:\t" + sHeader + "\t" + std::to_string(c+k+1)); - anomaly = false; - } - } - delete[] str; - - std::unique_lock lck(mtx); - logs.push_back(threadLog); - - return true; - -} - -bool DBG::DBGtoVariants(InSegment *inSegment) { - - Log threadLog; - threadLog.setId(inSegment->getuId()); - - std::string sHeader = inSegment->getSeqHeader(); - ParallelMap *map; - uint64_t key, i; - bool isFw = false; - std::vector> variants; - std::vector anomalies; - userInput.inBedInclude == "" ? detectAnomalies(inSegment, anomalies) : loadSegmentCoordinates(inSegment, anomalies); - uint64_t len = inSegment->getSegmentLen(); - - if (lengetInSequencePtr()->c_str(); - uint8_t* str = new uint8_t[len]; - - for (uint64_t i = 0; i> altPaths; - - StringGraph *stringGraph = new StringGraph(str, k, anomalies[0]-k); - - for(std::vector::iterator anomaly = anomalies.begin(); anomaly < anomalies.end(); ++anomaly) { - - std::deque DBGpaths; - altPaths = stringGraph->walkStringGraph(stringGraph->root, std::vector()); -// printAltPaths(altPaths, threadLog); - - if (altPaths.size() < 2) { - stringGraph->deleteStringGraph(stringGraph->root); -// delete stringGraph; - stringGraph = new StringGraph(str, k, *anomaly-k); - altPaths = stringGraph->walkStringGraph(stringGraph->root, std::vector()); - } - - bool backtrack = true; - - for (std::vector altPath : altPaths) { - key = hash(&altPath[0], &isFw); - i = key % mapCount; - - map = maps[i]; - auto got = map->find(key); - - // check for DBG consistency - if (got != map->end()) { - - DBGkmer &dbgkmer = got->second; - if (stringGraph->currentPos() < kcount-1 && ((isFw && dbgkmer.fw[altPath[k]] == 0) || (!isFw && dbgkmer.bw[3-altPath[k]] == 0))) { // find alternative paths - - double score = - checkNext(&altPath[0], &altPath[k]); - - threadLog.add(std::to_string(altPath[0]) + "," + std::to_string(altPath[k]) + "," + std::to_string(score)); - - std::deque newDBGpaths = findPaths(&altPath[0], &altPath[k], userInput.depth, DBGpath(stringGraph->currentPos(), score), threadLog); - - DBGpaths.insert(DBGpaths.end(), newDBGpaths.begin(), newDBGpaths.end()); - threadLog.add("Found " + std::to_string(DBGpaths.size()) + " alternative paths"); - // if (DBGpaths.size() > 2) { // only attempt to correct unique paths - // DBGpaths.clear(); - // break; - // } - - }else{backtrack = false;} - }else{backtrack = false;} - } - - if (DBGpaths.size() > 1) { - std::sort(DBGpaths.begin(), DBGpaths.end(), [](const DBGpath& v1, const DBGpath& v2) {return v1.score > v2.score;}); - DBGpaths = std::deque(DBGpaths.begin(), DBGpaths.begin()+1); // get at most two branches in the search tree - } - - uint8_t backtrackCnt = 0; - - if (DBGpaths.size() == 0 && backtrack) { // backtrack - - threadLog.add("Anomaly detected but no path is found. Backtracking"); - - for (uint8_t b = 0; b < userInput.backtrackingSpan; ++b) { - - ++backtrackCnt; - - if (variants.size() == 0 || stringGraph->currentPos()-backtrackCnt > variants.back()[0].pos) { // prevent going past the previously discovered variant - - threadLog.add("Testing position:\t" + sHeader + "\t" + std::to_string(stringGraph->currentPos())); - stringGraph->backtrack(str, k, 1); - altPaths = stringGraph->walkStringGraph(stringGraph->root, std::vector()); - std::vector altPath = altPaths[0]; - // printAltPaths(altPaths, threadLog); - double score = - checkNext(&altPath[0], &altPath[k]); - - std::deque newDBGpaths = findPaths(&altPath[0], &altPath[k], userInput.depth, DBGpath(stringGraph->currentPos(), score), threadLog); - - DBGpaths.insert(DBGpaths.end(), newDBGpaths.begin(), newDBGpaths.end()); - } - } - - if (DBGpaths.size() > 1) { - std::sort(DBGpaths.begin(), DBGpaths.end(), [](const DBGpath& v1, const DBGpath& v2) {return v1.score > v2.score;}); - DBGpaths = std::deque(DBGpaths.begin(), DBGpaths.begin()+1); // get at most two branches in the search tree - } - } - - if (DBGpaths.size() != 0) { - - threadLog.add("Candidate error at:\t" + sHeader + "\t" + std::to_string(stringGraph->currentPos()+1)); - std::vector alts; - - for (DBGpath dbgpath : DBGpaths) { - - if (dbgpath.type == SNV) { - alts.push_back(stringGraph->peek()); - alts.push_back(ctoi[(unsigned char)dbgpath.sequence[0]]); - }else if (dbgpath.type == INS) { - alts.push_back(stringGraph->peek()); - alts.push_back(4); - }else if (dbgpath.type == DEL) { - alts.push_back(ctoi[(unsigned char)dbgpath.sequence[0]]); - } - } - stringGraph->appendAlts(alts); - variants.push_back(DBGpaths); - } - if (backtrack) - stringGraph->advancePos(backtrackCnt); - if (std::next(anomaly) != anomalies.end()) { - if (*std::next(anomaly) < (*anomaly)+k) { - stringGraph->appendNext(*std::next(anomaly) - *anomaly); - for (uint8_t e = 0; e < *std::next(anomaly) - *anomaly; ++e) - stringGraph->pop_front(); - } - }else{ - delete stringGraph; - } - } - delete[] str; - - inSegment->addVariants(variants); - - std::string ext = getFileExt("." + userInput.outFile); - if (ext == "gfa" || ext == "gfa2" || ext == "gfa.gz" || ext == "gfa2.gz") - variantsToGFA(inSegment, threadLog); - - std::unique_lock lck (mtx); - logs.push_back(threadLog); - - return true; -} - -bool DBG::variantsToGFA(InSegment *inSegment, Log &threadLog) { - - uint64_t processed = 0; - uint32_t segmentCounter = 0, edgeCounter = 0, sUId, sUIdNew; - std::string *oldSequence = inSegment->getInSequencePtr(); - std::string sHeader = inSegment->getSeqHeader(); - std::string* inSequence; - Sequence* newSequence; - InSegment* newSegment; - std::vector sUIds; - uint32_t seqPos = inSegment->getSeqPos(), sId = 0, eId = 0; - std::vector>& variants = inSegment->getVariants(); - - for (std::deque DBGpaths : variants) { - - threadLog.add("Introducing variants at pos: " + std::to_string(DBGpaths[0].pos+1)); - - inSequence = new std::string(oldSequence->substr(processed, DBGpaths[0].pos-processed)); - newSequence = new Sequence{sHeader + "." + std::to_string(++segmentCounter), "", inSequence, NULL, seqPos}; - threadLog.add("Previous sequence: " + newSequence->header); - newSegment = genome->traverseInSegment(newSequence, std::vector(), sId++); - sUId = newSegment->getuId(); - - for (uint32_t sUIdprev : sUIds) { - InEdge edge(seqPos, eId++, sUIdprev, sUId, '+', '+', "0M", sHeader + ".edge." + std::to_string(++edgeCounter)); - genome->appendEdge(edge); - } - sUIds.clear(); - - uint32_t altCounter = 0; - bool originalAdded = false; - processed = DBGpaths[0].pos; - - for (DBGpath variant : DBGpaths) { - - if (variant.type != DEL && !originalAdded) { - - inSequence = new std::string(oldSequence->substr(DBGpaths[0].pos, 1)); - newSequence = new Sequence{sHeader + "." + std::to_string(++segmentCounter), "Candidate sequence", inSequence, NULL, seqPos}; - threadLog.add("Old segment from SNV/DEL error: " + newSequence->header + "\t" + *inSequence); - newSegment = genome->traverseInSegment(newSequence, std::vector(), sId++); - sUIdNew = newSegment->getuId(); - sUIds.push_back(sUIdNew); - - InEdge edge(seqPos, eId++, sUId, sUIdNew, '+', '+', "0M", sHeader + ".edge." + std::to_string(++edgeCounter)); - genome->appendEdge(edge); - - originalAdded = true; - ++processed; - - } - - if (variant.type == SNV || variant.type == DEL) { - - inSequence = new std::string(variant.sequence); - newSequence = new Sequence{sHeader + "." + std::to_string(segmentCounter) + ".alt" + std::to_string(++altCounter), "Candidate sequence", inSequence, NULL, seqPos}; - threadLog.add("New segment from SNV error: " + newSequence->header + "\t" + *inSequence); - newSegment = genome->traverseInSegment(newSequence, std::vector(), sId++); - sUIdNew = newSegment->getuId(); - sUIds.push_back(sUIdNew); - - } - - if (variant.type == SNV) { - - InEdge edge(seqPos, eId++, sUId, sUIdNew, '+', '+', "0M", sHeader + ".edge." + std::to_string(++edgeCounter)); - genome->appendEdge(edge); - - }else if (variant.type == INS) { - - sUIds.push_back(sUId); - - }else if (variant.type == DEL) { - - InEdge edge(seqPos, eId++, sUId, sUIdNew, '+', '+', "0M", sHeader + ".edge." + std::to_string(++edgeCounter)); - genome->appendEdge(edge); - sUIds.push_back(sUId); - } - } - } - - if (variants.size() > 0) { // residual sequence - - inSequence = new std::string(oldSequence->substr(processed)); - newSequence = new Sequence{sHeader + "." + std::to_string(++segmentCounter), "", inSequence, NULL, seqPos}; - threadLog.add("Previous sequence: " + newSequence->header); - newSegment = genome->traverseInSegment(newSequence, std::vector(), sId++); - sUId = newSegment->getuId(); - - for (uint32_t sUIdprev : sUIds) { - InEdge edge(seqPos, eId++, sUIdprev, sUId, '+', '+', "0M", sHeader + ".edge." + std::to_string(++edgeCounter)); - genome->appendEdge(edge); - } - - genome->deleteSegment(sHeader); - - } - - return true; - -} std::string colorPalette(uint8_t value){ @@ -978,6 +448,7 @@ void DBG::collapseNodes() { residualEdges[pair->first] = std::make_tuple(pair->second,idCounter,direction); // we preserve the edge } } + DBGsubgraph->erase(pair->first); // std::cout<<*sequence->sequence<first] = std::make_tuple(pair->second,idCounter,0); @@ -987,8 +458,6 @@ void DBG::collapseNodes() { std::vector inTags = {Tag{'f',"DP",std::to_string(pair->second.cov)},Tag{'Z',"CB",colorPalette(pair->second.color)}}; sequence->seqPos = seqPos++; // remember the order GFAsubgraph.appendSegment(sequence, inTags); - - DBGsubgraph->erase(pair->first); } jobWait(threadPool); while (residualEdges.size() != 0) { // construct the edges diff --git a/src/variants.cpp b/src/variants.cpp new file mode 100644 index 0000000..4051e2d --- /dev/null +++ b/src/variants.cpp @@ -0,0 +1,432 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "parallel-hashmap/phmap.h" +#include "parallel-hashmap/phmap_dump.h" + +#include "bed.h" +#include "struct.h" +#include "functions.h" +#include "global.h" +#include "uid-generator.h" +#include "gfa-lines.h" +#include "gfa.h" +#include "stream-obj.h" +#include "output.h" +#include "string-graph.h" + +#include "input.h" +#include "kmer.h" +#include "kreeq.h" +#include "fibonacci-heap.h" + +#include "variants.h" + +void DBG::correctSequences() { + + if (userInput.inSequence.empty()) + return; + + std::vector inSegments = *genome->getInSegments(); + for (InSegment *inSegment : inSegments) + DBGtoVariants(inSegment); +} + +bool DBG::DBGtoVariants(InSegment *inSegment) { + + Log threadLog; + + uint64_t explored = 0, len = inSegment->getSegmentLen(); + + if (len mapRange; + std::vector> variants; + + unsigned char* first = (unsigned char*)inSegment->getInSequencePtr()->c_str(); // prepare segment + uint8_t* str = new uint8_t[len]; + bool* visited = new bool[len]{false}; + + for (uint64_t i = 0; i targetsQueue; + phmap::parallel_flat_hash_map targetsMap; + + while (mapRange[1] < mapCount) { + + mapRange = computeMapRange(mapRange); + loadMapRange(mapRange); + + uint64_t key, i; + ParallelMap *map; + ParallelMap32 *map32; + bool isFw = false; + + for (uint16_t pos = 0; pos < userInput.maxSpan; ++pos) { // populate targets + targetsQueue.push(hash(str+pos)); + targetsMap[hash(str+pos)]; + } + + for (uint64_t c = 0; c= mapRange[0] && i < mapRange[1]) { + + map = maps[i]; + auto it = map->find(key); + std::pair pair; + if (it != map->end()) { + + if (pair.second.cov == 255) { + map32 = maps32[i]; + auto it = map32->find(key); + if (it == map32->end()) { + std::cerr<<"Error: int32 map missing 255 value from int8 map"<addVariants(variants); + + std::string ext = getFileExt("." + userInput.outFile); + if (ext == "gfa" || ext == "gfa2" || ext == "gfa.gz" || ext == "gfa2.gz") + variantsToGFA(inSegment, threadLog); + + std::unique_lock lck(mtx); + logs.push_back(threadLog); + + return true; +} + +std::pair> DBG::searchVariants(std::pair source, std::array mapRange, phmap::parallel_flat_hash_map &targetsMap, ParallelMap32* localGraphCache) { // dijkstra variant search + + bool explored = false; // true if we reached a node in the original graph + std::vector destinations; + FibonacciHeap*> Q; // node priority queue Q + phmap::parallel_flat_hash_map dist; + phmap::parallel_flat_hash_map> prev; // distance table + std::deque discoveredPaths; + + dist[source.first] = 1; + Q.insert(&source, 1); // associated priority equals dist[ยท] + + uint64_t key; + int16_t depth = 0; + bool direction = true, isFw; + + while (Q.size() > 0 && depth < userInput.kmerDepth + 1) { // the main loop + explored = false; // if there are still node in the queue we cannot be done + ParallelMap *map; + // ParallelMap32 *map32; + + std::pair* u = Q.extractMin(); // remove and return best vertex + auto got = prev.find(u->first); // check direction + if (got != prev.end()) { + direction = got->second.second; + } + + auto checkNext = [&,this] (uint64_t key, bool direction) { + auto startNode = targetsMap.find(key); + if (startNode == targetsMap.end()) { // if we connect to the original graph we are done + auto nextKmer = localGraphCache->find(key); // check if the node is in the cache (already visited) + + if (nextKmer == localGraphCache->end()) { // we cached this node before + uint64_t m = key % mapCount; + if (m >= mapRange[0] && m < mapRange[1]) { // the node is in not cached but is available to visit now + map = maps[m]; + auto got = map->find(key); + nextKmer = localGraphCache->insert(*got).first; // cache node for future iterations + }else{ + return false; + } + } + uint8_t alt = dist[u->first]; // g(n) + if (alt < std::numeric_limits::max()) + alt += 1; // Graph.Edges(u, v) << actual weight g(h) + auto got = dist.find(nextKmer->first); + if (got == dist.end()) { // if the next node was not seen before we add it to the queue + dist[nextKmer->first] = std::numeric_limits::max(); // unknown distance from source to v + Q.insert(&*nextKmer, 0); + } + if (alt < dist[nextKmer->first]) { + prev[nextKmer->first] = std::make_pair(u->first,direction); + dist[nextKmer->first] = alt; + Q.decreaseKey(&*nextKmer, alt); + } + } + return true; + }; + uint8_t edgeCount = 0, exploredCount = 0; + for (uint8_t i = 0; i<4; ++i) { + + if (direction || depth == 0) { // forward edges + + if (depth == 0) + direction = true; + + if (u->second.fw[i] > userInput.covCutOff) { + uint8_t nextKmer[k]; + buildNextKmer(nextKmer, u->first, i, true); // compute next node + key = hash(nextKmer, &isFw); + bool found = checkNext(key, isFw ? direction : !direction); + if (found) { + ++exploredCount; + if (targetsMap.find(key) != targetsMap.end()) + destinations.push_back(u->first); + } + ++edgeCount; + } + } + if (!direction || depth == 0) { + + if (depth == 0) + direction = false; + + if (u->second.bw[i] > userInput.covCutOff) { // backward edges + uint8_t nextKmer[k]; + buildNextKmer(nextKmer, u->first, i, false); // compute next node + key = hash(nextKmer, &isFw); + bool found = checkNext(key, isFw ? direction : !direction); + if (found) { + ++exploredCount; + if (targetsMap.find(key) != targetsMap.end()) + destinations.push_back(u->first); + } + ++edgeCount; + } + } + } + depth += 1; + + if(edgeCount == exploredCount || depth == userInput.kmerDepth + 1 || destinations.size() >= 10) // everything explored/found, depth reached, or top10 + explored = true; + } + if (destinations.size() > 1) { // traverse from target to source, the first path is the reference + for (uint64_t destination : destinations) { + DBGpath newPath; + newPath.sequence = reverseHash(destination); + while (destination != source.first) { // construct the shortest path with a stack S + + newPath.sequence.push_back(reverseHash(destination)[k-1]); + + dist.erase(destination); + destination = prev[destination].first; + } + discoveredPaths.push_back(newPath); + } + } + if (explored) { // we exahusted the search + for (auto node : dist) // clear the cache for this source + localGraphCache->erase(node.first); + } + return std::make_pair(explored,discoveredPaths); +} + +bool DBG::variantsToGFA(InSegment *inSegment, Log &threadLog) { + + uint64_t processed = 0; + uint32_t segmentCounter = 0, edgeCounter = 0, sUId, sUIdNew; + std::string *oldSequence = inSegment->getInSequencePtr(); + std::string sHeader = inSegment->getSeqHeader(); + std::string* inSequence; + Sequence* newSequence; + InSegment* newSegment; + std::vector sUIds; + uint32_t seqPos = inSegment->getSeqPos(), sId = 0, eId = 0; + std::vector>& variants = inSegment->getVariants(); + + for (std::deque DBGpaths : variants) { + + threadLog.add("Introducing variants at pos: " + std::to_string(DBGpaths[0].pos+1)); + + inSequence = new std::string(oldSequence->substr(processed, DBGpaths[0].pos-processed)); + newSequence = new Sequence{sHeader + "." + std::to_string(++segmentCounter), "", inSequence, NULL, seqPos}; + threadLog.add("Previous sequence: " + newSequence->header); + newSegment = genome->traverseInSegment(newSequence, std::vector(), sId++); + sUId = newSegment->getuId(); + + for (uint32_t sUIdprev : sUIds) { + InEdge edge(seqPos, eId++, sUIdprev, sUId, '+', '+', "0M", sHeader + ".edge." + std::to_string(++edgeCounter)); + genome->appendEdge(edge); + } + sUIds.clear(); + + uint32_t altCounter = 0; + bool originalAdded = false; + processed = DBGpaths[0].pos; + + for (DBGpath variant : DBGpaths) { + + if (variant.type != DEL && !originalAdded) { + + inSequence = new std::string(oldSequence->substr(DBGpaths[0].pos, 1)); + newSequence = new Sequence{sHeader + "." + std::to_string(++segmentCounter), "Candidate sequence", inSequence, NULL, seqPos}; + threadLog.add("Old segment from SNV/DEL error: " + newSequence->header + "\t" + *inSequence); + newSegment = genome->traverseInSegment(newSequence, std::vector(), sId++); + sUIdNew = newSegment->getuId(); + sUIds.push_back(sUIdNew); + + InEdge edge(seqPos, eId++, sUId, sUIdNew, '+', '+', "0M", sHeader + ".edge." + std::to_string(++edgeCounter)); + genome->appendEdge(edge); + + originalAdded = true; + ++processed; + + } + if (variant.type == SNV || variant.type == DEL) { + + inSequence = new std::string(variant.sequence); + newSequence = new Sequence{sHeader + "." + std::to_string(segmentCounter) + ".alt" + std::to_string(++altCounter), "Candidate sequence", inSequence, NULL, seqPos}; + threadLog.add("New segment from SNV error: " + newSequence->header + "\t" + *inSequence); + newSegment = genome->traverseInSegment(newSequence, std::vector(), sId++); + sUIdNew = newSegment->getuId(); + sUIds.push_back(sUIdNew); + } + if (variant.type == SNV) { + + InEdge edge(seqPos, eId++, sUId, sUIdNew, '+', '+', "0M", sHeader + ".edge." + std::to_string(++edgeCounter)); + genome->appendEdge(edge); + + }else if (variant.type == INS) { + + sUIds.push_back(sUId); + + }else if (variant.type == DEL) { + + InEdge edge(seqPos, eId++, sUId, sUIdNew, '+', '+', "0M", sHeader + ".edge." + std::to_string(++edgeCounter)); + genome->appendEdge(edge); + sUIds.push_back(sUId); + } + } + } + if (variants.size() > 0) { // residual sequence + + inSequence = new std::string(oldSequence->substr(processed)); + newSequence = new Sequence{sHeader + "." + std::to_string(++segmentCounter), "", inSequence, NULL, seqPos}; + threadLog.add("Previous sequence: " + newSequence->header); + newSegment = genome->traverseInSegment(newSequence, std::vector(), sId++); + sUId = newSegment->getuId(); + + for (uint32_t sUIdprev : sUIds) { + InEdge edge(seqPos, eId++, sUIdprev, sUId, '+', '+', "0M", sHeader + ".edge." + std::to_string(++edgeCounter)); + genome->appendEdge(edge); + } + genome->deleteSegment(sHeader); + } + return true; +} + +bool DBG::detectAnomalies(InSegment *inSegment, std::vector &anomalies) { // find disconnected kmers in segments + + Log threadLog; + threadLog.setId(inSegment->getuId()); + + std::string sHeader = inSegment->getSeqHeader(); + ParallelMap *map; + uint64_t key, i; + bool isFw = false, anomaly = false; + + uint64_t len = inSegment->getSegmentLen(); + + if (lengetInSequencePtr()->c_str(); + uint8_t* str = new uint8_t[len]; + + for (uint64_t i = 0; ifind(key); + + // check for DBG consistency + if (got != map->end()) { + DBGkmer &dbgkmer = got->second; + if (c < kcount-1 && ((isFw && dbgkmer.fw[*(str+c+k)] == 0) || (!isFw && dbgkmer.bw[3-*(str+c+k)] == 0))) // find alternative paths + anomaly = true; + }else{anomaly = true;} + if (anomaly) { + anomalies.push_back(c+k); + threadLog.add("Anomaly at:\t" + sHeader + "\t" + std::to_string(c+k+1)); + anomaly = false; + } + } + delete[] str; + + std::unique_lock lck(mtx); + logs.push_back(threadLog); + + return true; + +}