Skip to content

Commit

Permalink
-better mode
Browse files Browse the repository at this point in the history
  • Loading branch information
arangrhie committed Jan 10, 2022
1 parent cd1df51 commit 6bb3b37
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/merfin/merfin-globals.C
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ merfinGlobal::open_Inputs(void) {
// Open VCF input. This is only needed for reportType OP_VAR_MER.
// main() checks that we have a vcfName.

if (reportType == OP_FILTER || reportType == OP_POLISH) {
if (reportType == OP_FILTER || reportType == OP_POLISH || reportType == OP_BETTER) {
fprintf(stderr, "-- Opening vcf file '%s'.\n", vcfName);
inVcf = new vcfFile(vcfName);

Expand Down
2 changes: 1 addition & 1 deletion src/merfin/merfin-globals.H
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
#define OP_DUMP 3
#define OP_FILTER 4
#define OP_POLISH 5

#define OP_BETTER 6

////////////////////////////////////////
//
Expand Down
6 changes: 5 additions & 1 deletion src/merfin/merfin-variants.C
Original file line number Diff line number Diff line change
Expand Up @@ -277,10 +277,14 @@ processVariants(void *G, void *T, void *S) {

// Generate output VCFs.

// Experimental: output vcf according to k*
// Best: output vcf according to k*
if (g->reportType == OP_POLISH) {
//fprintf(t->oVcf->file(), "%s", seqMer->bestVariant().c_str());
s->result += seqMer->bestVariant();

// Better: output vcf for reducing missing kmers only
} else if (g->reportType == OP_BETTER) {
s->result += seqMer->betterVariant();
}

// Filter vcf and print as it was in the original vcf, conservatively
Expand Down
6 changes: 5 additions & 1 deletion src/merfin/merfin.C
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ main(int32 argc, char **argv) {
} else if (strcmp(argv[arg], "-filter") == 0) {
G->reportType = OP_FILTER;

} else if (strcmp(argv[arg], "-better") == 0) {
G->reportType = OP_BETTER;

} else if (strcmp(argv[arg], "-polish") == 0) {
G->reportType = OP_POLISH;

Expand Down Expand Up @@ -149,12 +152,13 @@ main(int32 argc, char **argv) {
if ((G->reportType == OP_HIST) ||
(G->reportType == OP_DUMP) ||
(G->reportType == OP_POLISH) ||
(G->reportType == OP_BETTER) ||
(G->reportType == OP_FILTER)) {
if (G->seqName == nullptr) err.push_back("No input sequences (-sequence) supplied.\n");
if (G->outName == nullptr) err.push_back("No output (-output) supplied.\n");
}

if (G->reportType == OP_POLISH || G->reportType == OP_FILTER) {
if (G->reportType == OP_POLISH || G->reportType == OP_FILTER || G->reportType == OP_BETTER) {
if (G->vcfName == nullptr) err.push_back("No variant call input (-vcf) supplied; mandatory for -filter or -polish.\n");
}

Expand Down
60 changes: 59 additions & 1 deletion src/merfin/varMer.C
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ varMer::bestFilter() {
vector<vcfRecord*> records; // empty vector with no records

for ( int ii = 0; ii < numMs.size(); ii++ ) {

// ignore when all kmers are 'missings'
if ( numMs.at(ii) == seqs.at(ii).size() - kmer::merSize() + 1) continue;

Expand Down Expand Up @@ -200,6 +199,65 @@ varMer::bestFilter() {
return records;
}

/***
* Get the better variant, report HOM only if the variant reduces num. missing kmers
***/
string
varMer::betterVariant() {
uint32 numMissing = UINT32_MAX; // actual minimum number of missing kmers in the combination with minimum missings
vector<int> idxs;

// add reference path as default
if ( numMs.size() == 0 ) return "";
uint32 refMissing = numMs.at(0);
numMissing = refMissing;

// only add the variant if the numMs are less or equal to the ref path
for ( int ii = 0; ii < numMs.size(); ii++ ) {

// lowest num. missings? add to report
if ( numMs.at(ii) < numMissing ) {
numMissing = numMs.at(ii);
idxs.clear();
idxs.push_back(ii);

} else if ( numMs.at(ii) == numMissing && numMs.at(ii) < refMissing ) {
// has the same numMissing, less than refMissing
idxs.push_back(ii);

} // else : ignore

}

// ignore if all kmers only increase missing kmers
if ( idxs.size() == 0 ) return "";

list<int> gtIdxs;
int idx = idxs.at(0); // path index
// report only one best
// select the longest alt if there are multiple bests
//
// 1) only one combination has the minimum num. of missings
if ( idxs.size() == 1 ) {
// return records as HOM
return getHomRecord(idx);
}
// 2) multiple combinations with equal num. of missings
else {
// get the path idx with longest seq length
uint32 seqLenMax = seqs.at(idx).size();
for ( int ii = 1; ii < idxs.size(); ii++ ) {
uint32 seqLen = seqs.at(idxs.at(ii)).length();
if ( seqLen > seqLenMax ) {
seqLenMax = seqLen;
idx = idxs.at(ii);
}
}
// return records as HOM
return getHomRecord(idx);
}
}

string
varMer::bestVariant() {

Expand Down
1 change: 1 addition & 0 deletions src/merfin/varMer.H
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public:

public:
string bestVariant();
string betterVariant(); // Report variants only if it reduces num. missings; regardless of k*
vector<vcfRecord*> bestFilter();

private:
Expand Down

0 comments on commit 6bb3b37

Please sign in to comment.