Skip to content

Commit

Permalink
Merge pull request #16 from arangrhie/better
Browse files Browse the repository at this point in the history
better and loose modes
  • Loading branch information
arangrhie authored Dec 2, 2022
2 parents fc4f89a + 26b14c1 commit 39806a7
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 22 deletions.
2 changes: 1 addition & 1 deletion src/merfin/merfin-globals.C
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ merfinGlobal::open_Inputs(void) {
// Open VCF input. This is only needed for reportType OP_VAR_MER.
// main() checks that we have a vcfName.

if (reportType == OP_FILTER || reportType == OP_POLISH) {
if (reportType == OP_FILTER || reportType == OP_POLISH || reportType == OP_BETTER || reportType == OP_STRICT || reportType == OP_LOOSE) {
fprintf(stderr, "-- Opening vcf file '%s'.\n", vcfName);
inVcf = new vcfFile(vcfName);

Expand Down
4 changes: 3 additions & 1 deletion src/merfin/merfin-globals.H
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@
#define OP_DUMP 3
#define OP_FILTER 4
#define OP_POLISH 5

#define OP_BETTER 6
#define OP_STRICT 7
#define OP_LOOSE 8

////////////////////////////////////////
//
Expand Down
23 changes: 14 additions & 9 deletions src/merfin/merfin-variants.C
Original file line number Diff line number Diff line change
Expand Up @@ -277,14 +277,25 @@ processVariants(void *G, void *T, void *S) {

// Generate output VCFs.

// Experimental: output vcf according to k*
// Best: output vcf according to k*
if (g->reportType == OP_POLISH) {
//fprintf(t->oVcf->file(), "%s", seqMer->bestVariant().c_str());
s->result += seqMer->bestVariant();
}

// Better: output vcf for reducing missing kmers only
} else if (g->reportType == OP_BETTER) {
s->result += seqMer->betterVariant();

// Strict: output vcf for reducing missing kmers only
} else if (g->reportType == OP_STRICT) {
s->result += seqMer->strictPolish();

// Loose: remove variants increasing missing kmers only
} else if (g->reportType == OP_LOOSE) {
s->result += seqMer->loosePolish();

// Filter vcf and print as it was in the original vcf, conservatively
else {
} else {
vector<vcfRecord*> records = seqMer->bestFilter();

for (uint64 i = 0; i < records.size(); i++)
Expand All @@ -293,17 +304,11 @@ processVariants(void *G, void *T, void *S) {
}

// Cleanup.

delete seqMer;
delete[] refTemplate;
} // Over posGTlist.
}






void
outputVariants(void *G, void *S) {
merfinGlobal *g = (merfinGlobal *)G;
Expand Down
19 changes: 13 additions & 6 deletions src/merfin/merfin.C
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,16 @@ main(int32 argc, char **argv) {
} else if (strcmp(argv[arg], "-filter") == 0) {
G->reportType = OP_FILTER;

} else if (strcmp(argv[arg], "-better") == 0) {
G->reportType = OP_BETTER;

} else if (strcmp(argv[arg], "-strict") == 0) {
G->reportType = OP_STRICT;

} else if (strcmp(argv[arg], "-loose") == 0) {
fprintf(stderr, "*EXPERIMENTAL* Running in -loose mode\n");
G->reportType = OP_LOOSE;

} else if (strcmp(argv[arg], "-polish") == 0) {
G->reportType = OP_POLISH;

Expand Down Expand Up @@ -146,15 +156,12 @@ main(int32 argc, char **argv) {

// Check inputs are present for the various modes.

if ((G->reportType == OP_HIST) ||
(G->reportType == OP_DUMP) ||
(G->reportType == OP_POLISH) ||
(G->reportType == OP_FILTER)) {
if (G->reportType != OP_COMPL) {
if (G->seqName == nullptr) err.push_back("No input sequences (-sequence) supplied.\n");
if (G->outName == nullptr) err.push_back("No output (-output) supplied.\n");
}

if (G->reportType == OP_POLISH || G->reportType == OP_FILTER) {
if (G->reportType == OP_POLISH || G->reportType == OP_FILTER || G->reportType == OP_BETTER || G->reportType == OP_STRICT || G->reportType == OP_LOOSE) {
if (G->vcfName == nullptr) err.push_back("No variant call input (-vcf) supplied; mandatory for -filter or -polish.\n");
}

Expand Down Expand Up @@ -326,7 +333,7 @@ main(int32 argc, char **argv) {
ss->setInOrderOutput(true);
}

if (G->reportType == OP_POLISH || G->reportType == OP_FILTER) {
if (G->reportType != OP_HIST && G->reportType != OP_DUMP && G->reportType != OP_COMPL) {
fprintf(stderr, "-- Generate variant mers and score them.\n");
ss = new sweatShop(loadSequence, processVariants, outputVariants);
ss->setInOrderOutput(false);
Expand Down
203 changes: 200 additions & 3 deletions src/merfin/varMer.C
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ varMer::score(merfinGlobal *g) {
seq = seqs.at(ii);
m_ks.clear();
m_dks.clear();
// fprintf(stderr, "\n[ DEBUG ]:: score %d th combination :: %s:%u-%u\t%s\n", ii, posGt->_chr, posGt->_rStart, posGt->_rEnd, seq.c_str());
// fprintf(stderr, "\n[ DEBUG ]:: score %d th combination :: %s:%u-%u\t%s\n", ii, posGt->_chr, posGt->_rStart, posGt->_rEnd, seq.c_str());

idx = 0;

Expand Down Expand Up @@ -128,7 +128,6 @@ varMer::score(merfinGlobal *g) {
// store new k*
m_ks.push_back(kMetric);
// fprintf(stderr, "[ DEBUG ] :: push_back kMetric for idx: %u. Kr: %.3f Ka: %.0f K*: %.3f\n\n", idx, readK, asmK, kMetric);

// store the delta k* when the variant is applied
m_dks.push_back(oDeltak-nDeltak);

Expand All @@ -155,7 +154,6 @@ varMer::bestFilter() {
vector<vcfRecord*> records; // empty vector with no records

for ( int ii = 0; ii < numMs.size(); ii++ ) {

// ignore when all kmers are 'missings'
if ( numMs.at(ii) == seqs.at(ii).size() - kmer::merSize() + 1) continue;

Expand Down Expand Up @@ -200,6 +198,205 @@ varMer::bestFilter() {
return records;
}

/***
* Get the better variant, report HOM only if the variant reduces num. missing kmers
***/
string
varMer::betterVariant() {
uint32 numMissing = UINT32_MAX; // actual minimum number of missing kmers in the combination with minimum missings
vector<int> idxs;
// fprintf(stderr, "[ DEBUG ] :: Better mode called\n");

// add reference path as default
if ( numMs.size() == 0 ) return "";
uint32 refMissing = numMs.at(0);
numMissing = refMissing;

// only add the variant if the numMs are less or equal to the ref path
for ( int ii = 0; ii < numMs.size(); ii++ ) {

// lowest num. missings? add to report
if ( numMs.at(ii) < numMissing ) {
numMissing = numMs.at(ii);
idxs.clear();
idxs.push_back(ii);

} else if ( numMs.at(ii) == numMissing && numMs.at(ii) < refMissing ) {
// has the same numMissing, less than refMissing
idxs.push_back(ii);

} // else : ignore

}

// ignore if all kmers only increase missing kmers
if ( idxs.size() == 0 ) return "";

int idx = idxs.at(0); // path index
// report only one best
// select the longest alt if there are multiple bests
//
// 1) only one combination has the minimum num. of missings
if ( idxs.size() == 1 ) {
// return records as HOM
return getHomRecord(idx);
}
// 2) multiple combinations with equal num. of missings
else {
// get the path idx with longest seq length
uint32 seqLenMax = seqs.at(idx).size();
for ( int ii = 1; ii < idxs.size(); ii++ ) {
uint32 seqLen = seqs.at(idxs.at(ii)).length();
if ( seqLen > seqLenMax ) {
seqLenMax = seqLen;
idx = idxs.at(ii);
}
}
// return records as HOM
return getHomRecord(idx);
}
}

string
varMer::strictPolish() {
uint32 numMissing = UINT32_MAX; // actual minimum number of missing kmers in the combination with minimum missings
vector<int> idxs;

// add reference path as default
if ( numMs.size() == 0 ) return "";
uint32 refMissing = numMs.at(0);
numMissing = refMissing;

// only add the variant if the numMs are less or equal to the ref path
for ( int ii = 0; ii < numMs.size(); ii++ ) {

// lowest num. missings? add to report
if ( numMs.at(ii) < numMissing ) {
numMissing = numMs.at(ii);
idxs.clear();
idxs.push_back(ii);

} else if ( numMs.at(ii) == numMissing && numMs.at(ii) < refMissing ) {
// has the same numMissing, less than refMissing
idxs.push_back(ii);

} // else : ignore

}

// ignore if all kmers only increase missing kmers
if ( idxs.size() == 0 ) return "";

list<int> gtIdxs;
int idx = idxs.at(0); // path index
// report only one best
// select the longest alt if there are multiple bests
//
// 1) only one combination has the minimum num. of missings
if ( idxs.size() == 1 ) {
// return records as HOM
return getHomRecord(idx);
}
// 2) multiple combinations with equal num. of missings
else {
// TODO: Adjust this part
// get the path idx with longest seq length
uint32 seqLenMax = seqs.at(idx).size();
for ( int ii = 1; ii < idxs.size(); ii++ ) {
uint32 seqLen = seqs.at(idxs.at(ii)).length();
if ( seqLen > seqLenMax ) {
seqLenMax = seqLen;
idx = idxs.at(ii);
}
}
// return records as HOM
return getHomRecord(idx);
}
}

string
varMer::loosePolish() {

// fprintf(stderr, "[ DEBUG ] :: Loose mode called\n");

uint32 numMissing = UINT32_MAX; // actual minimum number of missing kmers in the combination with minimum missings
vector<int> idxs;

// add reference path as default
if ( numMs.size() == 0 ) return "";

uint32 refMissing = numMs.at(0);
numMissing = refMissing;

// only add the variant if the numMs are less or equal to the ref path
for ( int ii = 0; ii < numMs.size(); ii++ ) {

// lowest num. missings? add to report
if ( numMs.at(ii) < numMissing ) {
numMissing = numMs.at(ii);
idxs.clear();
idxs.push_back(ii);

} else if ( numMs.at(ii) == numMissing && numMs.at(ii) <= refMissing ) {
// has the same numMissing, less than refMissing
idxs.push_back(ii);

} // else : ignore

}

// fprintf(stderr, "[ DEBUG ] :: num. combinations picked up: %lu at Mmin %u\n", idxs.size(), numMissing);

// ignore if all kmers only increase missing kmers
if ( idxs.size() == 0 ) return "";

int idx = idxs.at(0); // path index
// report all, with warnings for multiple pathes

// 1) only one combination has the minimum num. of missings
if ( idxs.size() == 1 ) {
// return records as HOM
return getHomRecord(idx);
}
// 2) multiple combinations with equal num. of missings
else {

// only 2 pathes exist, first is the REF path
if (idxs.at(0) == 0 && idxs.size() == 2)
return getHomRecord(idxs.at(1));

// find the path with most ALT variants
int maxVars = 0;
int maxIdx = idx;

// for each pathes
for ( int ii = 1; ii < idxs.size(); ii++) {
int count = 0;
idx = idxs.at(ii);

// check if each gt is ALT
for ( uint64 i = 0; i < gtPaths.at(idx).size(); i++) {
int altIdx = gtPaths[idx][i];
if (altIdx > 0) {
count++;
}
}

if (count > maxVars) {
maxVars = count;
maxIdx = idx;
}
}

fprintf(stderr, "[ WARNING ] :: Multiple (%lu) alternate pathes detected in a path beginning with variant : %s", idxs.size(), posGt->_gts[0]->_record->save().c_str());
fprintf(stderr, "[ WARNING ] :: Max. %d ALT variants selected\n", maxVars);
return getHomRecord(maxIdx);
}
}

/***
* -polish mode. Use k*
*/
string
varMer::bestVariant() {

Expand Down
3 changes: 3 additions & 0 deletions src/merfin/varMer.H
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ public:

public:
string bestVariant();
string betterVariant(); // Report variants only if it reduces num. missings; regardless of k*; Taking longer pathes for ties
string strictPolish(); // Report variants only if it reduces num. missings; regardless of k*; Taking minimum edits
string loosePolish(); // Remove variants only if it increases num. missings; regardless of k*; Taking maximum edits
vector<vcfRecord*> bestFilter();

private:
Expand Down
2 changes: 1 addition & 1 deletion src/meryl
2 changes: 1 addition & 1 deletion src/utility

0 comments on commit 39806a7

Please sign in to comment.