diff --git a/include/input.h b/include/input.h index aa0258a..e3c189a 100644 --- a/include/input.h +++ b/include/input.h @@ -6,20 +6,20 @@ struct UserInputTeloscope : UserInput { std::string outRoute; + std::pair canonicalPatterns; std::vector patterns; uint32_t windowSize = 1000; uint8_t kmerLen = 21; uint32_t step = 500; - std::pair canonicalPatterns; + unsigned short int minBlockLen = 24; + unsigned short int minBlockDist = 5; + unsigned short int maxBlockDist = 100; - double maxMem = 0; - std::string prefix = ".", outFile = ""; bool keepWindowData = false; // Memory intensive bool modeMatch = true, modeEntropy = true, modeGC = true; // Change to: de novo, user-defined - float densityThreshold = 0.5f; // Threshold for telomere detection - uint32_t maxGaps = 2; // Allowed gaps in telomere blocks - uint8_t blockDistance = 6; - + + double maxMem = 0; + std::string prefix = ".", outFile = ""; }; class Input { diff --git a/include/teloscope.h b/include/teloscope.h index 1b23cc3..688a0f3 100644 --- a/include/teloscope.h +++ b/include/teloscope.h @@ -134,10 +134,12 @@ class Teloscope { std::vector mergeTelomereBlocks(const std::vector& winBlocks); void writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile, - std::unordered_map& patternMatchFiles, - std::unordered_map& patternCountFiles, - std::unordered_map& patternDensityFiles, - std::ofstream& telomereBlocksFile); + std::unordered_map& patternMatchFiles, + std::unordered_map& patternCountFiles, + std::unordered_map& patternDensityFiles, + std::ofstream& allBlocksFile, + std::ofstream& canonicalBlocksFile, + std::ofstream& noncanonicalBlocksFile); void handleBEDFile(); diff --git a/src/main.cpp b/src/main.cpp index 2badc31..e7ab020 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -48,7 +48,9 @@ int main(int argc, char **argv) { {"threads", required_argument, 0, 'j'}, {"keep-window-data", no_argument, 0, 'k'}, {"mode", required_argument, 0, 'm'}, - + {"min-block-length", required_argument, 0, 'l'}, + {"max-block-distance", required_argument, 0, 'd'}, + {"verbose", no_argument, &verbose_flag, 1}, {"cmd", no_argument, &cmd_flag, 1}, {"version", no_argument, 0, 'v'}, @@ -61,7 +63,7 @@ int main(int argc, char **argv) { int option_index = 0; - c = getopt_long(argc, argv, "-:f:j:m:o:p:s:w:c:kvh", long_options, &option_index); + c = getopt_long(argc, argv, "-:f:j:m:o:p:s:w:c:l:d:kvh", long_options, &option_index); // if (optind < argc && !isPipe) { // if pipe wasn't assigned already @@ -158,38 +160,6 @@ int main(int argc, char **argv) { break; - // case 'p': - // { - // std::istringstream patternStream(optarg); - // std::string pattern; - - // while (std::getline(patternStream, pattern, ',')) { - // if (pattern.empty()) continue; - - // if (std::any_of(pattern.begin(), pattern.end(), ::isdigit)) { - // std::cerr << "Error: Pattern '" << pattern << "' contains numerical characters.\n"; - // exit(EXIT_FAILURE); - // } - - // unmaskSequence(pattern); - - // std::cout << "Adding pattern: " << pattern << " and its reverse complement" << "\n"; - // userInput.patterns.emplace_back(pattern); - // userInput.patterns.emplace_back(revCom(pattern)); - // } - - // if (userInput.patterns.empty()) { - // userInput.patterns = {"TTAGGG", "CCCTAA"}; - // std::cout << "No patterns provided. Only scanning for canonical patterns: TTAGGG, CCCTAA" << "\n"; - // } else { - // // Remove duplicates - // std::sort(userInput.patterns.begin(), userInput.patterns.end()); - // auto last = std::unique(userInput.patterns.begin(), userInput.patterns.end()); - // userInput.patterns.erase(last, userInput.patterns.end()); - // } - // } - // break; - case 'p': { std::istringstream patternStream(optarg); @@ -212,7 +182,7 @@ int main(int argc, char **argv) { // Add each combination and its reverse complement to userInput.patterns for (const std::string &comb : combinations) { - std::cout << "Adding pattern: " << comb << " and its reverse complement" << "\n"; + std::cout << "Adding pattern: " << comb << " and its reverse complement: " << revCom(comb) << "\n"; userInput.patterns.emplace_back(comb); userInput.patterns.emplace_back(revCom(comb)); } @@ -316,6 +286,14 @@ int main(int argc, char **argv) { case 'k': userInput.keepWindowData = true; break; + + case 'l': + userInput.minBlockLen = std::stoi(optarg); + break; + + case 'd': + userInput.maxBlockDist = std::stoi(optarg); + break; } if (argc == 2 || // handle various cases in which the output should include summary stats diff --git a/src/teloscope.cpp b/src/teloscope.cpp index 8fbce42..f893362 100644 --- a/src/teloscope.cpp +++ b/src/teloscope.cpp @@ -161,40 +161,40 @@ std::vector Teloscope::getTelomereBlocks(const std::vector Teloscope::mergeTelomereBlocks(const std::vector& winBlocks) { std::vector mergedBlocks; + unsigned short int minBlockLen = userInput.minBlockLen; + unsigned short int minBlockDist = userInput.minBlockDist; + unsigned short int maxBlockDist = userInput.maxBlockDist; if (winBlocks.empty()) { return mergedBlocks; // No blocks to merge } - // Initialize the first block as the current merged block - TelomereBlock currentBlock = winBlocks[0]; - uint16_t D = this->trie.getLongestPatternSize(); // Use D as the merging distance threshold + TelomereBlock currentBlock = winBlocks[0]; // Initialize the first block as the current block + // uint16_t D = this->trie.getLongestPatternSize(); // Use D as the merging distance threshold for (size_t i = 1; i < winBlocks.size(); ++i) { const TelomereBlock& nextBlock = winBlocks[i]; uint64_t currentEnd = currentBlock.start + currentBlock.blockLen; uint64_t distance = nextBlock.start - currentEnd; - if (distance <= 2 * D) { - // Merge the blocks by extending the current block + if ((distance <= maxBlockDist && distance > minBlockDist) || distance == 0) { uint64_t newEnd = nextBlock.start + nextBlock.blockLen; currentBlock.blockLen = newEnd - currentBlock.start; + } else { - // Add the current block to the merged blocks - mergedBlocks.push_back(currentBlock); - // Start a new current block - currentBlock = nextBlock; + if (currentBlock.blockLen >= minBlockLen) { + mergedBlocks.push_back(currentBlock); + } + currentBlock = nextBlock; // Start a new block } } - // Add the last current block to merged blocks mergedBlocks.push_back(currentBlock); return mergedBlocks; } - void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData, WindowData& nextOverlapData) { windowData.windowStart = windowStart; // CHECK: Why is this here? unsigned short int longestPatternSize = this->trie.getLongestPatternSize(); @@ -225,7 +225,6 @@ void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, W if (current->isEndOfWord) { std::string pattern = window.substr(i, j - i + 1); - // bool isCanonical = (pattern == "TTAGGG" || pattern == "CCCTAA"); // Check canonical patterns bool isCanonical = (pattern == userInput.canonicalPatterns.first || pattern == userInput.canonicalPatterns.second); // Check canonical patterns // Update windowData from prevOverlapData @@ -261,7 +260,7 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope std::vector windows; uint32_t windowStart = 0; - uint32_t currentWindowSize = std::min(userInput.windowSize, static_cast(sequence.size())); // In case first segment is short + uint32_t currentWindowSize = std::min(windowSize, static_cast(sequenceSize)); // In case first segment is short std::string window = sequence.substr(0, currentWindowSize); std::unordered_map> segmentBlocks = { @@ -298,18 +297,19 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope }; for (const auto& [groupName, matches] : patternMatches) { - if (matches.size() >= 2) { // Jack: Conditional either here or within function - segmentBlocks[groupName] = getTelomereBlocks(matches, windowData.windowStart); + if (windowData.canonicalCounts >= 2 || windowData.nonCanonicalCounts >= 4) { // JACK: Add to user cutoffs + auto winBlocks = getTelomereBlocks(matches, windowData.windowStart); + segmentBlocks[groupName].insert(segmentBlocks[groupName].end(), winBlocks.begin(), winBlocks.end()); } } // Pass and reset overlap data prevOverlapData = nextOverlapData; - nextOverlapData = WindowData(); // Reset nextOverlapData for the next iteration + nextOverlapData = WindowData(); // Reset for next iteration // Prepare next window windowStart += step; - if (windowStart >= sequence.size()) { + if (windowStart >= sequenceSize) { break; } @@ -340,21 +340,35 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope void Teloscope::writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile, - std::unordered_map& patternMatchFiles, - std::unordered_map& patternCountFiles, - std::unordered_map& patternDensityFiles, - std::ofstream& telomereBlocksFile) { + std::unordered_map& patternMatchFiles, + std::unordered_map& patternCountFiles, + std::unordered_map& patternDensityFiles, + std::ofstream& allBlocksFile, + std::ofstream& canonicalBlocksFile, + std::ofstream& noncanonicalBlocksFile) { for (const auto& pathData : allPathData) { - // const auto& seqPos = pathData.seqPos; const auto& header = pathData.header; const auto& windows = pathData.windows; // Process telomere blocks for (const auto& [groupName, blocks] : pathData.mergedBlocks) { - for (const auto& block : blocks) { - uint64_t blockEnd = block.start + block.blockLen; - telomereBlocksFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << groupName << "\n"; + std::ofstream* outputFile = nullptr; + + // By group + if (groupName == "all") { + outputFile = &allBlocksFile; + } else if (groupName == "canonical") { + outputFile = &canonicalBlocksFile; + } else if (groupName == "non-canonical") { + outputFile = &noncanonicalBlocksFile; + } + + if (outputFile) { + for (const auto& block : blocks) { + uint64_t blockEnd = block.start + block.blockLen; + *outputFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << groupName << "\n"; + } } } @@ -413,19 +427,21 @@ void Teloscope::handleBEDFile() { std::unordered_map patternMatchFiles; // Jack: replace with vector to reduce cache locality? std::unordered_map patternCountFiles; std::unordered_map patternDensityFiles; - std::ofstream telomereBlocksFile; + std::ofstream allBlocksFile; + std::ofstream canonicalBlocksFile; + std::ofstream noncanonicalBlocksFile; std::cout << "Reporting window matches and metrics in BED/BEDgraphs...\n"; - // Open files once if their modes are enabled - if (userInput.modeEntropy) { + // Open files for writing + if (userInput.keepWindowData && userInput.modeEntropy) { shannonFile.open(userInput.outRoute + "/shannonEntropy.bedgraph"); } - if (userInput.modeGC) { + if (userInput.keepWindowData && userInput.modeGC) { gcContentFile.open(userInput.outRoute + "/gcContent.bedgraph"); } - if (userInput.modeMatch) { + if (userInput.keepWindowData && userInput.modeMatch) { for (const auto& pattern : userInput.patterns) { patternMatchFiles[pattern].open(userInput.outRoute + "/" + pattern + "_matches.bed"); @@ -434,10 +450,13 @@ void Teloscope::handleBEDFile() { } } - telomereBlocksFile.open(userInput.outRoute + "/telomere_blocks.bed"); + allBlocksFile.open(userInput.outRoute + "/telomere_blocks_all.bed"); + canonicalBlocksFile.open(userInput.outRoute + "/telomere_blocks_canonical.bed"); + noncanonicalBlocksFile.open(userInput.outRoute + "/telomere_blocks_noncanonical.bed"); - // Write data for each window - writeBEDFile(shannonFile, gcContentFile, patternMatchFiles, patternCountFiles, patternDensityFiles, telomereBlocksFile); + // Pass the files to writeBEDFile + writeBEDFile(shannonFile, gcContentFile, patternMatchFiles, patternCountFiles, patternDensityFiles, + allBlocksFile, canonicalBlocksFile, noncanonicalBlocksFile); // Close all files once if (userInput.modeEntropy) { @@ -458,7 +477,9 @@ void Teloscope::handleBEDFile() { } } - telomereBlocksFile.close(); + allBlocksFile.close(); + canonicalBlocksFile.close(); + noncanonicalBlocksFile.close(); } void Teloscope::printSummary() {