Skip to content

Commit

Permalink
Nov 12, 2024: Better memory usage - clean
Browse files Browse the repository at this point in the history
  • Loading branch information
AldhairMedico committed Nov 12, 2024
1 parent 7dcfc57 commit 97dc44c
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 192 deletions.
2 changes: 1 addition & 1 deletion include/input.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ struct UserInputTeloscope : UserInput {
uint32_t windowSize = 1000;
uint8_t kmerLen = 21;
uint32_t step = 500;
unsigned short int minBlockLen = 500;
unsigned short int minBlockLen = 50;
unsigned short int maxBlockDist = 50;

bool keepWindowData = false; // Memory intensive
Expand Down
26 changes: 0 additions & 26 deletions include/teloscope.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,6 @@ class Trie {
};


// struct PatternData {
// std::vector<uint32_t> patMatches; // Match indexes in window
// uint32_t count = 0; // Total pattern count
// float density = 0.0f; // Density of the pattern
// };


struct TelomereBlock {
uint64_t start;
uint16_t blockLen; // End = start + blockLen
Expand All @@ -66,7 +59,6 @@ struct WindowData {
float shannonEntropy;
// uint32_t winHDistance = 0;

// std::unordered_map<std::string, PatternData> patternMap; // Condensed pattern data
std::vector<TelomereBlock> winBlocks;
std::vector<uint8_t> hDistances;

Expand Down Expand Up @@ -124,16 +116,6 @@ class Teloscope {
return static_cast<float>(gcCount) / windowSize * 100.0;
}


// inline void getPatternDensities(WindowData& windowData, uint32_t windowSize) {
// for (auto &entry : windowData.patternMap) {
// auto &pattern = entry.first;
// auto &data = entry.second;
// data.density = static_cast<float>(data.count * pattern.size()) / windowSize;
// }
// }


float getMean(const std::vector<float>& values);
float getMedian(std::vector<float> values);
float getMin(const std::vector<float> values);
Expand Down Expand Up @@ -162,14 +144,6 @@ class Teloscope {

std::vector<TelomereBlock> mergeTelomereBlocks(const std::vector<TelomereBlock>& winBlocks);

// void writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile,
// std::unordered_map<std::string, std::ofstream>& patternMatchFiles,
// std::unordered_map<std::string, std::ofstream>& patternCountFiles,
// std::unordered_map<std::string, std::ofstream>& patternDensityFiles,
// std::ofstream& allBlocksFile,
// std::ofstream& canonicalBlocksFile,
// std::ofstream& noncanonicalBlocksFile);

void writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile,
std::ofstream& canonicalMatchFile, std::ofstream& noncanonicalMatchFile,
std::ofstream& canonicalCountFile, std::ofstream& noncanonicalCountFile,
Expand Down
186 changes: 21 additions & 165 deletions src/teloscope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope

if (userInput.modeGC) { windowData.gcContent = getGCContent(windowData.nucleotideCounts, window.size()); }
if (userInput.modeEntropy) { windowData.shannonEntropy = getShannonEntropy(windowData.nucleotideCounts, window.size()); }
// if (userInput.modeMatch) { getPatternDensities(windowData, window.size()); }

windowData.canonicalDensity = static_cast<float>(windowData.canonicalCounts) / window.size();
windowData.nonCanonicalDensity = static_cast<float>(windowData.nonCanonicalCounts) / window.size();

Expand Down Expand Up @@ -339,10 +339,10 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope


void Teloscope::writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile,
std::ofstream& canonicalMatchFile, std::ofstream& noncanonicalMatchFile,
std::ofstream& canonicalCountFile, std::ofstream& noncanonicalCountFile,
std::ofstream& canonicalDensityFile, std::ofstream& noncanonicalDensityFile,
std::ofstream& allBlocksFile, std::ofstream& canonicalBlocksFile, std::ofstream& noncanonicalBlocksFile) {
std::ofstream& canonicalMatchFile, std::ofstream& noncanonicalMatchFile,
std::ofstream& canonicalCountFile, std::ofstream& noncanonicalCountFile,
std::ofstream& canonicalDensityFile, std::ofstream& noncanonicalDensityFile,
std::ofstream& allBlocksFile, std::ofstream& canonicalBlocksFile, std::ofstream& noncanonicalBlocksFile) {

for (const auto& pathData : allPathData) {
const auto& header = pathData.header;
Expand Down Expand Up @@ -389,8 +389,8 @@ void Teloscope::writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcConten
// Write window GC content if enabled
if (userInput.modeGC) {
gcContentFile << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
<< window.gcContent << "\n";
<< windowEnd << "\t"
<< window.gcContent << "\n";
gcContentValues.push_back(window.gcContent);
}

Expand All @@ -399,32 +399,32 @@ void Teloscope::writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcConten
// Write canonical matches
for (auto pos : window.canonicalMatches) {
canonicalMatchFile << header << "\t"
<< window.windowStart + pos << "\t"
<< window.windowStart + pos + 1 << "\t"
<< "canonical" << "\n";
<< window.windowStart + pos << "\t"
<< window.windowStart + pos + 1 << "\t"
<< "canonical" << "\n";
}

// Write non-canonical matches
for (auto pos : window.nonCanonicalMatches) {
noncanonicalMatchFile << header << "\t"
<< window.windowStart + pos << "\t"
<< window.windowStart + pos + 1 << "\t"
<< "non-canonical" << "\n";
<< window.windowStart + pos << "\t"
<< window.windowStart + pos + 1 << "\t"
<< "non-canonical" << "\n";
}

// Write count data
canonicalCountFile << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
<< window.canonicalCounts << "\n";
<< windowEnd << "\t"
<< window.canonicalCounts << "\n";

noncanonicalCountFile << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
<< window.nonCanonicalCounts << "\n";
<< windowEnd << "\t"
<< window.nonCanonicalCounts << "\n";

// Write density data
canonicalDensityFile << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
<< window.canonicalDensity << "\n";
<< windowEnd << "\t"
<< window.canonicalDensity << "\n";

noncanonicalDensityFile << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
Expand Down Expand Up @@ -473,8 +473,8 @@ void Teloscope::handleBEDFile() {

// Pass the files to writeBEDFile
writeBEDFile(shannonFile, gcContentFile, canonicalMatchFile, noncanonicalMatchFile,
canonicalCountFile, noncanonicalCountFile, canonicalDensityFile,
noncanonicalDensityFile, allBlocksFile, canonicalBlocksFile, noncanonicalBlocksFile);
canonicalCountFile, noncanonicalCountFile, canonicalDensityFile,
noncanonicalDensityFile, allBlocksFile, canonicalBlocksFile, noncanonicalBlocksFile);

// Close all files once
if (userInput.modeEntropy) {
Expand All @@ -498,150 +498,6 @@ void Teloscope::handleBEDFile() {
}



// void Teloscope::writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile,
// std::unordered_map<std::string, std::ofstream>& patternMatchFiles,
// std::unordered_map<std::string, std::ofstream>& patternCountFiles,
// std::unordered_map<std::string, std::ofstream>& patternDensityFiles,
// std::ofstream& allBlocksFile,
// std::ofstream& canonicalBlocksFile,
// std::ofstream& noncanonicalBlocksFile) {

// for (const auto& pathData : allPathData) {
// const auto& header = pathData.header;
// const auto& windows = pathData.windows;

// // Process telomere blocks
// for (const auto& [groupName, blocks] : pathData.mergedBlocks) {
// std::ofstream* outputFile = nullptr;

// // By group
// if (groupName == "all") {
// outputFile = &allBlocksFile;
// } else if (groupName == "canonical") {
// outputFile = &canonicalBlocksFile;
// } else if (groupName == "non-canonical") {
// outputFile = &noncanonicalBlocksFile;
// }

// if (outputFile) {
// for (const auto& block : blocks) {
// uint64_t blockEnd = block.start + block.blockLen;
// *outputFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << groupName << "\n";
// }
// }
// }

// if (!userInput.keepWindowData) { // If windowData is not stored, return
// continue;
// }

// // Process window data
// for (const auto& window : windows) {
// totalNWindows++; // Update total window count
// uint32_t windowEnd = window.windowStart + window.currentWindowSize; // Start is already 0-based

// // Write window Shannon entropy if enabled
// if (userInput.modeEntropy) {
// shannonFile << header << "\t" << window.windowStart << "\t"
// << windowEnd << "\t"
// << window.shannonEntropy << "\n";
// entropyValues.push_back(window.shannonEntropy); // Update entropy values
// }

// // Write window GC content if enabled
// if (userInput.modeGC) {
// gcContentFile << header << "\t" << window.windowStart << "\t"
// << windowEnd << "\t"
// << window.gcContent << "\n";
// gcContentValues.push_back(window.gcContent);
// }

// // Write pattern data if enabled
// if (userInput.modeMatch) {
// for (const auto& [pattern, data] : window.patternMap) {
// for (auto pos : data.patMatches) {
// patternMatchFiles[pattern] << header << "\t"
// << window.windowStart + pos << "\t"
// << window.windowStart + pos + pattern.length() << "\t" // Start is already 0-based
// << pattern << "\n";
// patternCounts[pattern]++; // Update total pattern counts
// }

// patternCountFiles[pattern] << header << "\t" << window.windowStart << "\t"
// << windowEnd << "\t"
// << data.count << "\n";

// patternDensityFiles[pattern] << header << "\t" << window.windowStart << "\t"
// << windowEnd << "\t"
// << data.density << "\n";
// }
// }
// }
// }
// }

// void Teloscope::handleBEDFile() {
// std::ofstream shannonFile;
// std::ofstream gcContentFile;
// std::unordered_map<std::string, std::ofstream> patternMatchFiles; // Jack: replace with vector to reduce cache locality?
// std::unordered_map<std::string, std::ofstream> patternCountFiles;
// std::unordered_map<std::string, std::ofstream> patternDensityFiles;
// std::ofstream allBlocksFile;
// std::ofstream canonicalBlocksFile;
// std::ofstream noncanonicalBlocksFile;
// std::cout << "Reporting window matches and metrics in BED/BEDgraphs...\n";

// // Open files for writing
// if (userInput.keepWindowData && userInput.modeEntropy) {
// shannonFile.open(userInput.outRoute + "/shannonEntropy.bedgraph");
// }

// if (userInput.keepWindowData && userInput.modeGC) {
// gcContentFile.open(userInput.outRoute + "/gcContent.bedgraph");
// }

// if (userInput.keepWindowData && userInput.modeMatch) {

// for (const auto& pattern : userInput.patterns) {
// patternMatchFiles[pattern].open(userInput.outRoute + "/" + pattern + "_matches.bed");
// patternCountFiles[pattern].open(userInput.outRoute + "/" + pattern + "_count.bedgraph");
// patternDensityFiles[pattern].open(userInput.outRoute + "/" + pattern + "_density.bedgraph");
// }
// }

// allBlocksFile.open(userInput.outRoute + "/telomere_blocks_all.bed");
// canonicalBlocksFile.open(userInput.outRoute + "/telomere_blocks_canonical.bed");
// noncanonicalBlocksFile.open(userInput.outRoute + "/telomere_blocks_noncanonical.bed");

// // Pass the files to writeBEDFile
// writeBEDFile(shannonFile, gcContentFile, patternMatchFiles, patternCountFiles, patternDensityFiles,
// allBlocksFile, canonicalBlocksFile, noncanonicalBlocksFile);

// // Close all files once
// if (userInput.modeEntropy) {
// shannonFile.close();
// }
// if (userInput.modeGC) {
// gcContentFile.close();
// }
// if (userInput.modeMatch) {
// for (auto& [pattern, file] : patternMatchFiles) {
// file.close();
// }
// for (auto& [pattern, file] : patternCountFiles) {
// file.close();
// }
// for (auto& [pattern, file] : patternDensityFiles) {
// file.close();
// }
// }

// allBlocksFile.close();
// canonicalBlocksFile.close();
// noncanonicalBlocksFile.close();
// }

void Teloscope::printSummary() {
if (!userInput.keepWindowData) { // If windowData is not stored, skip
return;
Expand Down

0 comments on commit 97dc44c

Please sign in to comment.