Skip to content

Commit

Permalink
Oct 15, 2024: Merge fixed and flags -l -d
Browse files Browse the repository at this point in the history
  • Loading branch information
AldhairMedico committed Oct 15, 2024
1 parent 8d75a95 commit 14be67d
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 80 deletions.
14 changes: 7 additions & 7 deletions include/input.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@
struct UserInputTeloscope : UserInput {

std::string outRoute;
std::pair<std::string, std::string> canonicalPatterns;
std::vector<std::string> patterns;
uint32_t windowSize = 1000;
uint8_t kmerLen = 21;
uint32_t step = 500;
std::pair<std::string, std::string> canonicalPatterns;
unsigned short int minBlockLen = 24;
unsigned short int minBlockDist = 5;
unsigned short int maxBlockDist = 100;

double maxMem = 0;
std::string prefix = ".", outFile = "";
bool keepWindowData = false; // Memory intensive
bool modeMatch = true, modeEntropy = true, modeGC = true; // Change to: de novo, user-defined
float densityThreshold = 0.5f; // Threshold for telomere detection
uint32_t maxGaps = 2; // Allowed gaps in telomere blocks
uint8_t blockDistance = 6;


double maxMem = 0;
std::string prefix = ".", outFile = "";
};

class Input {
Expand Down
10 changes: 6 additions & 4 deletions include/teloscope.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,12 @@ class Teloscope {
std::vector<TelomereBlock> mergeTelomereBlocks(const std::vector<TelomereBlock>& winBlocks);

void writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile,
std::unordered_map<std::string, std::ofstream>& patternMatchFiles,
std::unordered_map<std::string, std::ofstream>& patternCountFiles,
std::unordered_map<std::string, std::ofstream>& patternDensityFiles,
std::ofstream& telomereBlocksFile);
std::unordered_map<std::string, std::ofstream>& patternMatchFiles,
std::unordered_map<std::string, std::ofstream>& patternCountFiles,
std::unordered_map<std::string, std::ofstream>& patternDensityFiles,
std::ofstream& allBlocksFile,
std::ofstream& canonicalBlocksFile,
std::ofstream& noncanonicalBlocksFile);

void handleBEDFile();

Expand Down
48 changes: 13 additions & 35 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ int main(int argc, char **argv) {
{"threads", required_argument, 0, 'j'},
{"keep-window-data", no_argument, 0, 'k'},
{"mode", required_argument, 0, 'm'},

{"min-block-length", required_argument, 0, 'l'},
{"max-block-distance", required_argument, 0, 'd'},

{"verbose", no_argument, &verbose_flag, 1},
{"cmd", no_argument, &cmd_flag, 1},
{"version", no_argument, 0, 'v'},
Expand All @@ -61,7 +63,7 @@ int main(int argc, char **argv) {

int option_index = 0;

c = getopt_long(argc, argv, "-:f:j:m:o:p:s:w:c:kvh", long_options, &option_index);
c = getopt_long(argc, argv, "-:f:j:m:o:p:s:w:c:l:d:kvh", long_options, &option_index);

// if (optind < argc && !isPipe) { // if pipe wasn't assigned already

Expand Down Expand Up @@ -158,38 +160,6 @@ int main(int argc, char **argv) {
break;


// case 'p':
// {
// std::istringstream patternStream(optarg);
// std::string pattern;

// while (std::getline(patternStream, pattern, ',')) {
// if (pattern.empty()) continue;

// if (std::any_of(pattern.begin(), pattern.end(), ::isdigit)) {
// std::cerr << "Error: Pattern '" << pattern << "' contains numerical characters.\n";
// exit(EXIT_FAILURE);
// }

// unmaskSequence(pattern);

// std::cout << "Adding pattern: " << pattern << " and its reverse complement" << "\n";
// userInput.patterns.emplace_back(pattern);
// userInput.patterns.emplace_back(revCom(pattern));
// }

// if (userInput.patterns.empty()) {
// userInput.patterns = {"TTAGGG", "CCCTAA"};
// std::cout << "No patterns provided. Only scanning for canonical patterns: TTAGGG, CCCTAA" << "\n";
// } else {
// // Remove duplicates
// std::sort(userInput.patterns.begin(), userInput.patterns.end());
// auto last = std::unique(userInput.patterns.begin(), userInput.patterns.end());
// userInput.patterns.erase(last, userInput.patterns.end());
// }
// }
// break;

case 'p':
{
std::istringstream patternStream(optarg);
Expand All @@ -212,7 +182,7 @@ int main(int argc, char **argv) {

// Add each combination and its reverse complement to userInput.patterns
for (const std::string &comb : combinations) {
std::cout << "Adding pattern: " << comb << " and its reverse complement" << "\n";
std::cout << "Adding pattern: " << comb << " and its reverse complement: " << revCom(comb) << "\n";
userInput.patterns.emplace_back(comb);
userInput.patterns.emplace_back(revCom(comb));
}
Expand Down Expand Up @@ -316,6 +286,14 @@ int main(int argc, char **argv) {
case 'k':
userInput.keepWindowData = true;
break;

case 'l':
userInput.minBlockLen = std::stoi(optarg);
break;

case 'd':
userInput.maxBlockDist = std::stoi(optarg);
break;
}

if (argc == 2 || // handle various cases in which the output should include summary stats
Expand Down
89 changes: 55 additions & 34 deletions src/teloscope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,40 +161,40 @@ std::vector<TelomereBlock> Teloscope::getTelomereBlocks(const std::vector<uint32

std::vector<TelomereBlock> Teloscope::mergeTelomereBlocks(const std::vector<TelomereBlock>& winBlocks) {
std::vector<TelomereBlock> mergedBlocks;
unsigned short int minBlockLen = userInput.minBlockLen;
unsigned short int minBlockDist = userInput.minBlockDist;
unsigned short int maxBlockDist = userInput.maxBlockDist;

if (winBlocks.empty()) {
return mergedBlocks; // No blocks to merge
}

// Initialize the first block as the current merged block
TelomereBlock currentBlock = winBlocks[0];
uint16_t D = this->trie.getLongestPatternSize(); // Use D as the merging distance threshold
TelomereBlock currentBlock = winBlocks[0]; // Initialize the first block as the current block
// uint16_t D = this->trie.getLongestPatternSize(); // Use D as the merging distance threshold

for (size_t i = 1; i < winBlocks.size(); ++i) {
const TelomereBlock& nextBlock = winBlocks[i];
uint64_t currentEnd = currentBlock.start + currentBlock.blockLen;
uint64_t distance = nextBlock.start - currentEnd;

if (distance <= 2 * D) {
// Merge the blocks by extending the current block
if ((distance <= maxBlockDist && distance > minBlockDist) || distance == 0) {
uint64_t newEnd = nextBlock.start + nextBlock.blockLen;
currentBlock.blockLen = newEnd - currentBlock.start;

} else {
// Add the current block to the merged blocks
mergedBlocks.push_back(currentBlock);
// Start a new current block
currentBlock = nextBlock;
if (currentBlock.blockLen >= minBlockLen) {
mergedBlocks.push_back(currentBlock);
}
currentBlock = nextBlock; // Start a new block
}
}

// Add the last current block to merged blocks
mergedBlocks.push_back(currentBlock);

return mergedBlocks;
}



void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData, WindowData& nextOverlapData) {
windowData.windowStart = windowStart; // CHECK: Why is this here?
unsigned short int longestPatternSize = this->trie.getLongestPatternSize();
Expand Down Expand Up @@ -225,7 +225,6 @@ void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, W

if (current->isEndOfWord) {
std::string pattern = window.substr(i, j - i + 1);
// bool isCanonical = (pattern == "TTAGGG" || pattern == "CCCTAA"); // Check canonical patterns
bool isCanonical = (pattern == userInput.canonicalPatterns.first || pattern == userInput.canonicalPatterns.second); // Check canonical patterns

// Update windowData from prevOverlapData
Expand Down Expand Up @@ -261,7 +260,7 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope

std::vector<WindowData> windows;
uint32_t windowStart = 0;
uint32_t currentWindowSize = std::min(userInput.windowSize, static_cast<uint32_t>(sequence.size())); // In case first segment is short
uint32_t currentWindowSize = std::min(windowSize, static_cast<uint32_t>(sequenceSize)); // In case first segment is short
std::string window = sequence.substr(0, currentWindowSize);

std::unordered_map<std::string, std::vector<TelomereBlock>> segmentBlocks = {
Expand Down Expand Up @@ -298,18 +297,19 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope
};

for (const auto& [groupName, matches] : patternMatches) {
if (matches.size() >= 2) { // Jack: Conditional either here or within function
segmentBlocks[groupName] = getTelomereBlocks(matches, windowData.windowStart);
if (windowData.canonicalCounts >= 2 || windowData.nonCanonicalCounts >= 4) { // JACK: Add to user cutoffs
auto winBlocks = getTelomereBlocks(matches, windowData.windowStart);
segmentBlocks[groupName].insert(segmentBlocks[groupName].end(), winBlocks.begin(), winBlocks.end());
}
}

// Pass and reset overlap data
prevOverlapData = nextOverlapData;
nextOverlapData = WindowData(); // Reset nextOverlapData for the next iteration
nextOverlapData = WindowData(); // Reset for next iteration

// Prepare next window
windowStart += step;
if (windowStart >= sequence.size()) {
if (windowStart >= sequenceSize) {
break;
}

Expand Down Expand Up @@ -340,21 +340,35 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope


void Teloscope::writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile,
std::unordered_map<std::string, std::ofstream>& patternMatchFiles,
std::unordered_map<std::string, std::ofstream>& patternCountFiles,
std::unordered_map<std::string, std::ofstream>& patternDensityFiles,
std::ofstream& telomereBlocksFile) {
std::unordered_map<std::string, std::ofstream>& patternMatchFiles,
std::unordered_map<std::string, std::ofstream>& patternCountFiles,
std::unordered_map<std::string, std::ofstream>& patternDensityFiles,
std::ofstream& allBlocksFile,
std::ofstream& canonicalBlocksFile,
std::ofstream& noncanonicalBlocksFile) {

for (const auto& pathData : allPathData) {
// const auto& seqPos = pathData.seqPos;
const auto& header = pathData.header;
const auto& windows = pathData.windows;

// Process telomere blocks
for (const auto& [groupName, blocks] : pathData.mergedBlocks) {
for (const auto& block : blocks) {
uint64_t blockEnd = block.start + block.blockLen;
telomereBlocksFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << groupName << "\n";
std::ofstream* outputFile = nullptr;

// By group
if (groupName == "all") {
outputFile = &allBlocksFile;
} else if (groupName == "canonical") {
outputFile = &canonicalBlocksFile;
} else if (groupName == "non-canonical") {
outputFile = &noncanonicalBlocksFile;
}

if (outputFile) {
for (const auto& block : blocks) {
uint64_t blockEnd = block.start + block.blockLen;
*outputFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << groupName << "\n";
}
}
}

Expand Down Expand Up @@ -413,19 +427,21 @@ void Teloscope::handleBEDFile() {
std::unordered_map<std::string, std::ofstream> patternMatchFiles; // Jack: replace with vector to reduce cache locality?
std::unordered_map<std::string, std::ofstream> patternCountFiles;
std::unordered_map<std::string, std::ofstream> patternDensityFiles;
std::ofstream telomereBlocksFile;
std::ofstream allBlocksFile;
std::ofstream canonicalBlocksFile;
std::ofstream noncanonicalBlocksFile;
std::cout << "Reporting window matches and metrics in BED/BEDgraphs...\n";

// Open files once if their modes are enabled
if (userInput.modeEntropy) {
// Open files for writing
if (userInput.keepWindowData && userInput.modeEntropy) {
shannonFile.open(userInput.outRoute + "/shannonEntropy.bedgraph");
}

if (userInput.modeGC) {
if (userInput.keepWindowData && userInput.modeGC) {
gcContentFile.open(userInput.outRoute + "/gcContent.bedgraph");
}

if (userInput.modeMatch) {
if (userInput.keepWindowData && userInput.modeMatch) {

for (const auto& pattern : userInput.patterns) {
patternMatchFiles[pattern].open(userInput.outRoute + "/" + pattern + "_matches.bed");
Expand All @@ -434,10 +450,13 @@ void Teloscope::handleBEDFile() {
}
}

telomereBlocksFile.open(userInput.outRoute + "/telomere_blocks.bed");
allBlocksFile.open(userInput.outRoute + "/telomere_blocks_all.bed");
canonicalBlocksFile.open(userInput.outRoute + "/telomere_blocks_canonical.bed");
noncanonicalBlocksFile.open(userInput.outRoute + "/telomere_blocks_noncanonical.bed");

// Write data for each window
writeBEDFile(shannonFile, gcContentFile, patternMatchFiles, patternCountFiles, patternDensityFiles, telomereBlocksFile);
// Pass the files to writeBEDFile
writeBEDFile(shannonFile, gcContentFile, patternMatchFiles, patternCountFiles, patternDensityFiles,
allBlocksFile, canonicalBlocksFile, noncanonicalBlocksFile);

// Close all files once
if (userInput.modeEntropy) {
Expand All @@ -458,7 +477,9 @@ void Teloscope::handleBEDFile() {
}
}

telomereBlocksFile.close();
allBlocksFile.close();
canonicalBlocksFile.close();
noncanonicalBlocksFile.close();
}

void Teloscope::printSummary() {
Expand Down

0 comments on commit 14be67d

Please sign in to comment.