From e6e0ab49a0edcced64825d69db7c07b5f1d97b52 Mon Sep 17 00:00:00 2001 From: "[AldhairMedico]" Date: Mon, 9 Dec 2024 23:43:18 -0500 Subject: [PATCH] Dec 09, 2024: hasCanDimer and filterBlocks --- include/input.h | 3 ++- include/teloscope.h | 13 ++++++---- src/input.cpp | 5 ++++ src/main.cpp | 2 +- src/teloscope.cpp | 60 ++++++++++++++++++++++++++++++++++----------- 5 files changed, 62 insertions(+), 21 deletions(-) diff --git a/include/input.h b/include/input.h index 80cb9f0..f84a460 100644 --- a/include/input.h +++ b/include/input.h @@ -15,7 +15,7 @@ struct UserInputTeloscope : UserInput { uint32_t windowSize = 1000; uint8_t kmerLen = 21; uint32_t step = 500; - unsigned short int minBlockLen = 100; // Not used anymore + unsigned short int minBlockLen = 2000; // Only for all blocks unsigned short int maxBlockDist = 200; unsigned short int minBlockCounts = 2; uint32_t terminalLimit = 50000; @@ -29,6 +29,7 @@ struct UserInputTeloscope : UserInput { bool outNonCanMatches = true; bool outITS = true; + // bool outPQonly = true; // bool outDistance = true; // bool outFasta = true; diff --git a/include/teloscope.h b/include/teloscope.h index 48f9b52..505a8cf 100644 --- a/include/teloscope.h +++ b/include/teloscope.h @@ -60,20 +60,22 @@ struct WindowData { float gcContent; float shannonEntropy; // uint32_t winHDistance = 0; - + + std::vector winMatches; std::vector hDistances; uint16_t canonicalCounts = 0; uint16_t nonCanonicalCounts = 0; float canonicalDensity = 0.0f; float nonCanonicalDensity = 0.0f; + bool hasCanDimer = false; - WindowData() : windowStart(0), currentWindowSize(0), gcContent(0.0f), shannonEntropy(0.0f) {} + WindowData() : windowStart(0), currentWindowSize(0), gcContent(0.0f), shannonEntropy(0.0f), hasCanDimer(false) {} }; struct SegmentData { std::vector windows; - std::vector terminalBlocks; - std::vector interstitialBlocks; + // std::vector terminalBlocks; + // std::vector interstitialBlocks; std::unordered_map> mergedBlocks; std::vector canonicalMatches; std::vector nonCanonicalMatches; @@ -135,7 +137,6 @@ class Teloscope { bool walkPath(InPath* path, std::vector &inSegments, std::vector &inGaps); - // void analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData, WindowData& nextOverlapData, SegmentData& segmentData); void analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData, WindowData& nextOverlapData, SegmentData& segmentData, uint32_t segmentSize); @@ -148,6 +149,8 @@ class Teloscope { std::vector getTelomereBlocks(const std::vector& inputMatches, uint16_t mergeDist); + std::vector filterBlocks(const std::vector& blocks); + void writeBEDFile(std::ofstream& windowMetricsFile, std::ofstream& windowRepeatsFile, std::ofstream& canonicalMatchFile, std::ofstream& noncanonicalMatchFile, std::ofstream& allBlocksFile, std::ofstream& canonicalBlocksFile); diff --git a/src/input.cpp b/src/input.cpp index cd2777e..9efc269 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -137,6 +137,11 @@ bool Teloscope::walkPath(InPath* path, std::vector &inSegments, std: } + // Filter "all" blocks at the path level + if (pathData.mergedBlocks.find("all") != pathData.mergedBlocks.end()) { + pathData.mergedBlocks["all"] = filterBlocks(pathData.mergedBlocks["all"]); + } + std::lock_guard lck(mtx); allPathData.push_back(std::move(pathData)); diff --git a/src/main.cpp b/src/main.cpp index 7d01f6e..f81e1fb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -2,7 +2,7 @@ #include // check -std::string version = "0.0.4"; +std::string version = "0.0.5"; // global std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); diff --git a/src/teloscope.cpp b/src/teloscope.cpp index c1a827b..81823a2 100644 --- a/src/teloscope.cpp +++ b/src/teloscope.cpp @@ -129,11 +129,34 @@ std::vector Teloscope::getTelomereBlocks(const std::vector Teloscope::filterBlocks(const std::vector& blocks) { + std::vector filteredBlocks; + + for (const auto& block : blocks) { + if (block.blockLen >= userInput.minBlockLen) { // Filter by length + filteredBlocks.push_back(block); + } + } + + if (filteredBlocks.size() > 2) { // Keep the two largest blocks + std::sort(filteredBlocks.begin(), filteredBlocks.end(), [](const TelomereBlock& a, const TelomereBlock& b) { + return a.blockLen > b.blockLen; + }); + filteredBlocks.resize(2); + } + + // Ensure the blocks are ordered by their start positions + std::sort(filteredBlocks.begin(), filteredBlocks.end(), [](const TelomereBlock& a, const TelomereBlock& b) { + return a.start < b.start; + }); + + return filteredBlocks; +} + void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData, WindowData& nextOverlapData, SegmentData& segmentData, uint32_t segmentSize) { -// void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData, WindowData& nextOverlapData, SegmentData& segmentData) { windowData.windowStart = windowStart; // CHECK: Why is this here? unsigned short int longestPatternSize = this->trie.getLongestPatternSize(); @@ -142,6 +165,7 @@ void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, // Determine starting index for Trie scanning uint32_t startIndex = (windowStart == 0 || overlapSize == 0) ? 0 : std::min(userInput.step - longestPatternSize, overlapSize - longestPatternSize); + int lastCanonicalPos = -1; for (uint32_t i = startIndex; i < window.size(); ++i) { // Nucleotide iterations @@ -177,39 +201,42 @@ void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, std::string pattern = window.substr(i, j - i + 1); bool isCanonical = (pattern == userInput.canonicalPatterns.first || pattern == userInput.canonicalPatterns.second); // Check canonical patterns bool isTerminal = (windowStart + i <= terminalLimit || windowStart + i >= segmentSize - terminalLimit); - uint8_t patternSize = pattern.size(); + float densityGain = static_cast(pattern.size()) / window.size(); + uint32_t matchPos = windowStart + i; // Update windowData from prevOverlapData if (j >= overlapSize || overlapSize == 0 || windowStart == 0) { - float densityGain = static_cast(patternSize) / window.size(); - uint32_t matchPos = windowStart + i; - if (isCanonical) { windowData.canonicalCounts++; windowData.canonicalDensity += densityGain; segmentData.canonicalMatches.push_back(matchPos); + + // Check for canonical dimer + if (!windowData.hasCanDimer && lastCanonicalPos >= 0 && (matchPos - lastCanonicalPos) <= userInput.canonicalSize) { + windowData.hasCanDimer = true; + } + lastCanonicalPos = matchPos; // Update last canonical position + } else { windowData.nonCanonicalCounts++; windowData.nonCanonicalDensity += densityGain; } if (isTerminal) { - segmentData.segMatches.push_back(matchPos); + windowData.winMatches.push_back(matchPos); if (!isCanonical) { segmentData.nonCanonicalMatches.push_back(matchPos); } } - // windowData.hDistances.push_back(userInput.hammingDistances[pattern]); - // windowData.winHDistance += userInput.hammingDistances[pattern]; } // Update nextOverlapData if (i >= userInput.step && overlapSize != 0 ) { if (isCanonical) { nextOverlapData.canonicalCounts++; - nextOverlapData.canonicalDensity += static_cast(patternSize) / window.size(); + nextOverlapData.canonicalDensity += densityGain; } else { nextOverlapData.nonCanonicalCounts++; - nextOverlapData.nonCanonicalDensity += static_cast(patternSize) / window.size(); + nextOverlapData.nonCanonicalDensity += densityGain; } } } @@ -243,7 +270,6 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope while (windowStart < segmentSize) { // Prepare and analyze current window WindowData windowData = prevOverlapData; - // analyzeWindow(window, windowStart, windowData, nextOverlapData, segmentData); analyzeWindow(window, windowStart, windowData, nextOverlapData, segmentData, segmentSize); if (userInput.modeGC) { windowData.gcContent = getGCContent(windowData.nucleotideCounts, window.size()); } @@ -254,10 +280,16 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope windowData.currentWindowSize = currentWindowSize; if (userInput.keepWindowData) { windows.emplace_back(windowData); } // User defines if windowData is stored - // Pass and reset overlap data - prevOverlapData = nextOverlapData; + prevOverlapData = nextOverlapData; // Pass and reset overlap data nextOverlapData = WindowData(); // Reset for next iteration + // Keep all in presence of canonical matches + if (windowData.hasCanDimer) { + segmentData.segMatches.insert(segmentData.segMatches.end(), + windowData.winMatches.begin(), + windowData.winMatches.end()); + } + // Prepare next window windowStart += step; if (windowStart >= segmentSize) { @@ -325,7 +357,7 @@ void Teloscope::writeBEDFile(std::ofstream& windowMetricsFile, std::ofstream& wi if (outputFile) { for (const auto& block : blocks) { uint64_t blockEnd = block.start + block.blockLen; - *outputFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << groupName << "\n"; + *outputFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << block.blockLen << "\n"; } } }