Skip to content

Commit

Permalink
Dec 09, 2024: hasCanDimer and filterBlocks
Browse files Browse the repository at this point in the history
  • Loading branch information
AldhairMedico committed Dec 10, 2024
1 parent 2dba771 commit e6e0ab4
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 21 deletions.
3 changes: 2 additions & 1 deletion include/input.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ struct UserInputTeloscope : UserInput {
uint32_t windowSize = 1000;
uint8_t kmerLen = 21;
uint32_t step = 500;
unsigned short int minBlockLen = 100; // Not used anymore
unsigned short int minBlockLen = 2000; // Only for all blocks
unsigned short int maxBlockDist = 200;
unsigned short int minBlockCounts = 2;
uint32_t terminalLimit = 50000;
Expand All @@ -29,6 +29,7 @@ struct UserInputTeloscope : UserInput {
bool outNonCanMatches = true;
bool outITS = true;

// bool outPQonly = true;
// bool outDistance = true;
// bool outFasta = true;

Expand Down
13 changes: 8 additions & 5 deletions include/teloscope.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,20 +60,22 @@ struct WindowData {
float gcContent;
float shannonEntropy;
// uint32_t winHDistance = 0;


std::vector<uint32_t> winMatches;
std::vector<uint8_t> hDistances;
uint16_t canonicalCounts = 0;
uint16_t nonCanonicalCounts = 0;
float canonicalDensity = 0.0f;
float nonCanonicalDensity = 0.0f;
bool hasCanDimer = false;

WindowData() : windowStart(0), currentWindowSize(0), gcContent(0.0f), shannonEntropy(0.0f) {}
WindowData() : windowStart(0), currentWindowSize(0), gcContent(0.0f), shannonEntropy(0.0f), hasCanDimer(false) {}
};

struct SegmentData {
std::vector<WindowData> windows;
std::vector<TelomereBlock> terminalBlocks;
std::vector<TelomereBlock> interstitialBlocks;
// std::vector<TelomereBlock> terminalBlocks;
// std::vector<TelomereBlock> interstitialBlocks;
std::unordered_map<std::string, std::vector<TelomereBlock>> mergedBlocks;
std::vector<uint32_t> canonicalMatches;
std::vector<uint32_t> nonCanonicalMatches;
Expand Down Expand Up @@ -135,7 +137,6 @@ class Teloscope {

bool walkPath(InPath* path, std::vector<InSegment*> &inSegments, std::vector<InGap> &inGaps);

// void analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData, WindowData& nextOverlapData, SegmentData& segmentData);
void analyzeWindow(const std::string &window, uint32_t windowStart,
WindowData& windowData, WindowData& nextOverlapData,
SegmentData& segmentData, uint32_t segmentSize);
Expand All @@ -148,6 +149,8 @@ class Teloscope {

std::vector<TelomereBlock> getTelomereBlocks(const std::vector<uint32_t>& inputMatches, uint16_t mergeDist);

std::vector<TelomereBlock> filterBlocks(const std::vector<TelomereBlock>& blocks);

void writeBEDFile(std::ofstream& windowMetricsFile, std::ofstream& windowRepeatsFile,
std::ofstream& canonicalMatchFile, std::ofstream& noncanonicalMatchFile,
std::ofstream& allBlocksFile, std::ofstream& canonicalBlocksFile);
Expand Down
5 changes: 5 additions & 0 deletions src/input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,11 @@ bool Teloscope::walkPath(InPath* path, std::vector<InSegment*> &inSegments, std:

}

// Filter "all" blocks at the path level
if (pathData.mergedBlocks.find("all") != pathData.mergedBlocks.end()) {
pathData.mergedBlocks["all"] = filterBlocks(pathData.mergedBlocks["all"]);
}

std::lock_guard<std::mutex> lck(mtx);
allPathData.push_back(std::move(pathData));

Expand Down
2 changes: 1 addition & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include <input.h> // check


std::string version = "0.0.4";
std::string version = "0.0.5";

// global
std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
Expand Down
60 changes: 46 additions & 14 deletions src/teloscope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,34 @@ std::vector<TelomereBlock> Teloscope::getTelomereBlocks(const std::vector<uint32
}


std::vector<TelomereBlock> Teloscope::filterBlocks(const std::vector<TelomereBlock>& blocks) {
std::vector<TelomereBlock> filteredBlocks;

for (const auto& block : blocks) {
if (block.blockLen >= userInput.minBlockLen) { // Filter by length
filteredBlocks.push_back(block);
}
}

if (filteredBlocks.size() > 2) { // Keep the two largest blocks
std::sort(filteredBlocks.begin(), filteredBlocks.end(), [](const TelomereBlock& a, const TelomereBlock& b) {
return a.blockLen > b.blockLen;
});
filteredBlocks.resize(2);
}

// Ensure the blocks are ordered by their start positions
std::sort(filteredBlocks.begin(), filteredBlocks.end(), [](const TelomereBlock& a, const TelomereBlock& b) {
return a.start < b.start;
});

return filteredBlocks;
}


void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart,
WindowData& windowData, WindowData& nextOverlapData,
SegmentData& segmentData, uint32_t segmentSize) {
// void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData, WindowData& nextOverlapData, SegmentData& segmentData) {

windowData.windowStart = windowStart; // CHECK: Why is this here?
unsigned short int longestPatternSize = this->trie.getLongestPatternSize();
Expand All @@ -142,6 +165,7 @@ void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart,

// Determine starting index for Trie scanning
uint32_t startIndex = (windowStart == 0 || overlapSize == 0) ? 0 : std::min(userInput.step - longestPatternSize, overlapSize - longestPatternSize);
int lastCanonicalPos = -1;

for (uint32_t i = startIndex; i < window.size(); ++i) { // Nucleotide iterations

Expand Down Expand Up @@ -177,39 +201,42 @@ void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart,
std::string pattern = window.substr(i, j - i + 1);
bool isCanonical = (pattern == userInput.canonicalPatterns.first || pattern == userInput.canonicalPatterns.second); // Check canonical patterns
bool isTerminal = (windowStart + i <= terminalLimit || windowStart + i >= segmentSize - terminalLimit);
uint8_t patternSize = pattern.size();
float densityGain = static_cast<float>(pattern.size()) / window.size();
uint32_t matchPos = windowStart + i;

// Update windowData from prevOverlapData
if (j >= overlapSize || overlapSize == 0 || windowStart == 0) {
float densityGain = static_cast<float>(patternSize) / window.size();
uint32_t matchPos = windowStart + i;

if (isCanonical) {
windowData.canonicalCounts++;
windowData.canonicalDensity += densityGain;
segmentData.canonicalMatches.push_back(matchPos);

// Check for canonical dimer
if (!windowData.hasCanDimer && lastCanonicalPos >= 0 && (matchPos - lastCanonicalPos) <= userInput.canonicalSize) {
windowData.hasCanDimer = true;
}
lastCanonicalPos = matchPos; // Update last canonical position

} else {
windowData.nonCanonicalCounts++;
windowData.nonCanonicalDensity += densityGain;
}
if (isTerminal) {
segmentData.segMatches.push_back(matchPos);
windowData.winMatches.push_back(matchPos);
if (!isCanonical) {
segmentData.nonCanonicalMatches.push_back(matchPos);
}
}
// windowData.hDistances.push_back(userInput.hammingDistances[pattern]);
// windowData.winHDistance += userInput.hammingDistances[pattern];
}

// Update nextOverlapData
if (i >= userInput.step && overlapSize != 0 ) {
if (isCanonical) {
nextOverlapData.canonicalCounts++;
nextOverlapData.canonicalDensity += static_cast<float>(patternSize) / window.size();
nextOverlapData.canonicalDensity += densityGain;
} else {
nextOverlapData.nonCanonicalCounts++;
nextOverlapData.nonCanonicalDensity += static_cast<float>(patternSize) / window.size();
nextOverlapData.nonCanonicalDensity += densityGain;
}
}
}
Expand Down Expand Up @@ -243,7 +270,6 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope
while (windowStart < segmentSize) {
// Prepare and analyze current window
WindowData windowData = prevOverlapData;
// analyzeWindow(window, windowStart, windowData, nextOverlapData, segmentData);
analyzeWindow(window, windowStart, windowData, nextOverlapData, segmentData, segmentSize);

if (userInput.modeGC) { windowData.gcContent = getGCContent(windowData.nucleotideCounts, window.size()); }
Expand All @@ -254,10 +280,16 @@ SegmentData Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope
windowData.currentWindowSize = currentWindowSize;
if (userInput.keepWindowData) { windows.emplace_back(windowData); } // User defines if windowData is stored

// Pass and reset overlap data
prevOverlapData = nextOverlapData;
prevOverlapData = nextOverlapData; // Pass and reset overlap data
nextOverlapData = WindowData(); // Reset for next iteration

// Keep all in presence of canonical matches
if (windowData.hasCanDimer) {
segmentData.segMatches.insert(segmentData.segMatches.end(),
windowData.winMatches.begin(),
windowData.winMatches.end());
}

// Prepare next window
windowStart += step;
if (windowStart >= segmentSize) {
Expand Down Expand Up @@ -325,7 +357,7 @@ void Teloscope::writeBEDFile(std::ofstream& windowMetricsFile, std::ofstream& wi
if (outputFile) {
for (const auto& block : blocks) {
uint64_t blockEnd = block.start + block.blockLen;
*outputFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << groupName << "\n";
*outputFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << block.blockLen << "\n";
}
}
}
Expand Down

0 comments on commit e6e0ab4

Please sign in to comment.