From cd1df51b20c8c1c4ae9a542bcf03b2d871ad29bc Mon Sep 17 00:00:00 2001 From: arangrhie Date: Mon, 10 Jan 2022 10:27:27 -0500 Subject: [PATCH 1/3] update submodules --- src/meryl | 2 +- src/utility | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/meryl b/src/meryl index 51fad4b..d8f336f 160000 --- a/src/meryl +++ b/src/meryl @@ -1 +1 @@ -Subproject commit 51fad4b236a7e22450cd23d405d83921572f5a50 +Subproject commit d8f336f791b92c2a265e70bba9f995621b67d195 diff --git a/src/utility b/src/utility index c58d0a3..4f84efb 160000 --- a/src/utility +++ b/src/utility @@ -1 +1 @@ -Subproject commit c58d0a3635175b35711508c8bf567c9c096ad1e8 +Subproject commit 4f84efbb0c907b14f5381fca2356bb11a0a12f53 From 6bb3b375936424cc0452c79d5a6c9f04a91bafd1 Mon Sep 17 00:00:00 2001 From: arangrhie Date: Mon, 10 Jan 2022 10:29:24 -0500 Subject: [PATCH 2/3] -better mode --- src/merfin/merfin-globals.C | 2 +- src/merfin/merfin-globals.H | 2 +- src/merfin/merfin-variants.C | 6 +++- src/merfin/merfin.C | 6 +++- src/merfin/varMer.C | 60 +++++++++++++++++++++++++++++++++++- src/merfin/varMer.H | 1 + 6 files changed, 72 insertions(+), 5 deletions(-) diff --git a/src/merfin/merfin-globals.C b/src/merfin/merfin-globals.C index e41cbd0..ef6c05b 100644 --- a/src/merfin/merfin-globals.C +++ b/src/merfin/merfin-globals.C @@ -208,7 +208,7 @@ merfinGlobal::open_Inputs(void) { // Open VCF input. This is only needed for reportType OP_VAR_MER. // main() checks that we have a vcfName. - if (reportType == OP_FILTER || reportType == OP_POLISH) { + if (reportType == OP_FILTER || reportType == OP_POLISH || reportType == OP_BETTER) { fprintf(stderr, "-- Opening vcf file '%s'.\n", vcfName); inVcf = new vcfFile(vcfName); diff --git a/src/merfin/merfin-globals.H b/src/merfin/merfin-globals.H index f6b7b31..1f85342 100644 --- a/src/merfin/merfin-globals.H +++ b/src/merfin/merfin-globals.H @@ -33,7 +33,7 @@ #define OP_DUMP 3 #define OP_FILTER 4 #define OP_POLISH 5 - +#define OP_BETTER 6 //////////////////////////////////////// // diff --git a/src/merfin/merfin-variants.C b/src/merfin/merfin-variants.C index d37aaa7..e72ddc5 100644 --- a/src/merfin/merfin-variants.C +++ b/src/merfin/merfin-variants.C @@ -277,10 +277,14 @@ processVariants(void *G, void *T, void *S) { // Generate output VCFs. - // Experimental: output vcf according to k* + // Best: output vcf according to k* if (g->reportType == OP_POLISH) { //fprintf(t->oVcf->file(), "%s", seqMer->bestVariant().c_str()); s->result += seqMer->bestVariant(); + + // Better: output vcf for reducing missing kmers only + } else if (g->reportType == OP_BETTER) { + s->result += seqMer->betterVariant(); } // Filter vcf and print as it was in the original vcf, conservatively diff --git a/src/merfin/merfin.C b/src/merfin/merfin.C index afdb859..c467384 100644 --- a/src/merfin/merfin.C +++ b/src/merfin/merfin.C @@ -116,6 +116,9 @@ main(int32 argc, char **argv) { } else if (strcmp(argv[arg], "-filter") == 0) { G->reportType = OP_FILTER; + } else if (strcmp(argv[arg], "-better") == 0) { + G->reportType = OP_BETTER; + } else if (strcmp(argv[arg], "-polish") == 0) { G->reportType = OP_POLISH; @@ -149,12 +152,13 @@ main(int32 argc, char **argv) { if ((G->reportType == OP_HIST) || (G->reportType == OP_DUMP) || (G->reportType == OP_POLISH) || + (G->reportType == OP_BETTER) || (G->reportType == OP_FILTER)) { if (G->seqName == nullptr) err.push_back("No input sequences (-sequence) supplied.\n"); if (G->outName == nullptr) err.push_back("No output (-output) supplied.\n"); } - if (G->reportType == OP_POLISH || G->reportType == OP_FILTER) { + if (G->reportType == OP_POLISH || G->reportType == OP_FILTER || G->reportType == OP_BETTER) { if (G->vcfName == nullptr) err.push_back("No variant call input (-vcf) supplied; mandatory for -filter or -polish.\n"); } diff --git a/src/merfin/varMer.C b/src/merfin/varMer.C index 77b1c19..75788f2 100644 --- a/src/merfin/varMer.C +++ b/src/merfin/varMer.C @@ -155,7 +155,6 @@ varMer::bestFilter() { vector records; // empty vector with no records for ( int ii = 0; ii < numMs.size(); ii++ ) { - // ignore when all kmers are 'missings' if ( numMs.at(ii) == seqs.at(ii).size() - kmer::merSize() + 1) continue; @@ -200,6 +199,65 @@ varMer::bestFilter() { return records; } +/*** + * Get the better variant, report HOM only if the variant reduces num. missing kmers + ***/ +string +varMer::betterVariant() { + uint32 numMissing = UINT32_MAX; // actual minimum number of missing kmers in the combination with minimum missings + vector idxs; + + // add reference path as default + if ( numMs.size() == 0 ) return ""; + uint32 refMissing = numMs.at(0); + numMissing = refMissing; + + // only add the variant if the numMs are less or equal to the ref path + for ( int ii = 0; ii < numMs.size(); ii++ ) { + + // lowest num. missings? add to report + if ( numMs.at(ii) < numMissing ) { + numMissing = numMs.at(ii); + idxs.clear(); + idxs.push_back(ii); + + } else if ( numMs.at(ii) == numMissing && numMs.at(ii) < refMissing ) { + // has the same numMissing, less than refMissing + idxs.push_back(ii); + + } // else : ignore + + } + + // ignore if all kmers only increase missing kmers + if ( idxs.size() == 0 ) return ""; + + list gtIdxs; + int idx = idxs.at(0); // path index + // report only one best + // select the longest alt if there are multiple bests + // + // 1) only one combination has the minimum num. of missings + if ( idxs.size() == 1 ) { + // return records as HOM + return getHomRecord(idx); + } + // 2) multiple combinations with equal num. of missings + else { + // get the path idx with longest seq length + uint32 seqLenMax = seqs.at(idx).size(); + for ( int ii = 1; ii < idxs.size(); ii++ ) { + uint32 seqLen = seqs.at(idxs.at(ii)).length(); + if ( seqLen > seqLenMax ) { + seqLenMax = seqLen; + idx = idxs.at(ii); + } + } + // return records as HOM + return getHomRecord(idx); + } +} + string varMer::bestVariant() { diff --git a/src/merfin/varMer.H b/src/merfin/varMer.H index 8488391..fe1dbb7 100644 --- a/src/merfin/varMer.H +++ b/src/merfin/varMer.H @@ -41,6 +41,7 @@ public: public: string bestVariant(); + string betterVariant(); // Report variants only if it reduces num. missings; regardless of k* vector bestFilter(); private: From 26b14c1cf1ecf539f2e6cd09d354209c0855bb59 Mon Sep 17 00:00:00 2001 From: arangrhie Date: Thu, 1 Dec 2022 21:15:32 -0500 Subject: [PATCH 3/3] add -loose mode --- src/merfin/merfin-globals.C | 2 +- src/merfin/merfin-globals.H | 2 + src/merfin/merfin-variants.C | 17 +++-- src/merfin/merfin.C | 17 +++-- src/merfin/varMer.C | 143 ++++++++++++++++++++++++++++++++++- src/merfin/varMer.H | 4 +- 6 files changed, 166 insertions(+), 19 deletions(-) diff --git a/src/merfin/merfin-globals.C b/src/merfin/merfin-globals.C index ef6c05b..329492d 100644 --- a/src/merfin/merfin-globals.C +++ b/src/merfin/merfin-globals.C @@ -208,7 +208,7 @@ merfinGlobal::open_Inputs(void) { // Open VCF input. This is only needed for reportType OP_VAR_MER. // main() checks that we have a vcfName. - if (reportType == OP_FILTER || reportType == OP_POLISH || reportType == OP_BETTER) { + if (reportType == OP_FILTER || reportType == OP_POLISH || reportType == OP_BETTER || reportType == OP_STRICT || reportType == OP_LOOSE) { fprintf(stderr, "-- Opening vcf file '%s'.\n", vcfName); inVcf = new vcfFile(vcfName); diff --git a/src/merfin/merfin-globals.H b/src/merfin/merfin-globals.H index 1f85342..b55ff86 100644 --- a/src/merfin/merfin-globals.H +++ b/src/merfin/merfin-globals.H @@ -34,6 +34,8 @@ #define OP_FILTER 4 #define OP_POLISH 5 #define OP_BETTER 6 +#define OP_STRICT 7 +#define OP_LOOSE 8 //////////////////////////////////////// // diff --git a/src/merfin/merfin-variants.C b/src/merfin/merfin-variants.C index e72ddc5..a4c8478 100644 --- a/src/merfin/merfin-variants.C +++ b/src/merfin/merfin-variants.C @@ -285,10 +285,17 @@ processVariants(void *G, void *T, void *S) { // Better: output vcf for reducing missing kmers only } else if (g->reportType == OP_BETTER) { s->result += seqMer->betterVariant(); - } + + // Strict: output vcf for reducing missing kmers only + } else if (g->reportType == OP_STRICT) { + s->result += seqMer->strictPolish(); + + // Loose: remove variants increasing missing kmers only + } else if (g->reportType == OP_LOOSE) { + s->result += seqMer->loosePolish(); // Filter vcf and print as it was in the original vcf, conservatively - else { + } else { vector records = seqMer->bestFilter(); for (uint64 i = 0; i < records.size(); i++) @@ -297,17 +304,11 @@ processVariants(void *G, void *T, void *S) { } // Cleanup. - delete seqMer; delete[] refTemplate; } // Over posGTlist. } - - - - - void outputVariants(void *G, void *S) { merfinGlobal *g = (merfinGlobal *)G; diff --git a/src/merfin/merfin.C b/src/merfin/merfin.C index c467384..a462179 100644 --- a/src/merfin/merfin.C +++ b/src/merfin/merfin.C @@ -119,6 +119,13 @@ main(int32 argc, char **argv) { } else if (strcmp(argv[arg], "-better") == 0) { G->reportType = OP_BETTER; + } else if (strcmp(argv[arg], "-strict") == 0) { + G->reportType = OP_STRICT; + + } else if (strcmp(argv[arg], "-loose") == 0) { + fprintf(stderr, "*EXPERIMENTAL* Running in -loose mode\n"); + G->reportType = OP_LOOSE; + } else if (strcmp(argv[arg], "-polish") == 0) { G->reportType = OP_POLISH; @@ -149,16 +156,12 @@ main(int32 argc, char **argv) { // Check inputs are present for the various modes. - if ((G->reportType == OP_HIST) || - (G->reportType == OP_DUMP) || - (G->reportType == OP_POLISH) || - (G->reportType == OP_BETTER) || - (G->reportType == OP_FILTER)) { + if (G->reportType != OP_COMPL) { if (G->seqName == nullptr) err.push_back("No input sequences (-sequence) supplied.\n"); if (G->outName == nullptr) err.push_back("No output (-output) supplied.\n"); } - if (G->reportType == OP_POLISH || G->reportType == OP_FILTER || G->reportType == OP_BETTER) { + if (G->reportType == OP_POLISH || G->reportType == OP_FILTER || G->reportType == OP_BETTER || G->reportType == OP_STRICT || G->reportType == OP_LOOSE) { if (G->vcfName == nullptr) err.push_back("No variant call input (-vcf) supplied; mandatory for -filter or -polish.\n"); } @@ -330,7 +333,7 @@ main(int32 argc, char **argv) { ss->setInOrderOutput(true); } - if (G->reportType == OP_POLISH || G->reportType == OP_FILTER) { + if (G->reportType != OP_HIST && G->reportType != OP_DUMP && G->reportType != OP_COMPL) { fprintf(stderr, "-- Generate variant mers and score them.\n"); ss = new sweatShop(loadSequence, processVariants, outputVariants); ss->setInOrderOutput(false); diff --git a/src/merfin/varMer.C b/src/merfin/varMer.C index 75788f2..8c392db 100644 --- a/src/merfin/varMer.C +++ b/src/merfin/varMer.C @@ -69,7 +69,7 @@ varMer::score(merfinGlobal *g) { seq = seqs.at(ii); m_ks.clear(); m_dks.clear(); - // fprintf(stderr, "\n[ DEBUG ]:: score %d th combination :: %s:%u-%u\t%s\n", ii, posGt->_chr, posGt->_rStart, posGt->_rEnd, seq.c_str()); + // fprintf(stderr, "\n[ DEBUG ]:: score %d th combination :: %s:%u-%u\t%s\n", ii, posGt->_chr, posGt->_rStart, posGt->_rEnd, seq.c_str()); idx = 0; @@ -128,7 +128,6 @@ varMer::score(merfinGlobal *g) { // store new k* m_ks.push_back(kMetric); // fprintf(stderr, "[ DEBUG ] :: push_back kMetric for idx: %u. Kr: %.3f Ka: %.0f K*: %.3f\n\n", idx, readK, asmK, kMetric); - // store the delta k* when the variant is applied m_dks.push_back(oDeltak-nDeltak); @@ -206,6 +205,62 @@ string varMer::betterVariant() { uint32 numMissing = UINT32_MAX; // actual minimum number of missing kmers in the combination with minimum missings vector idxs; + // fprintf(stderr, "[ DEBUG ] :: Better mode called\n"); + + // add reference path as default + if ( numMs.size() == 0 ) return ""; + uint32 refMissing = numMs.at(0); + numMissing = refMissing; + + // only add the variant if the numMs are less or equal to the ref path + for ( int ii = 0; ii < numMs.size(); ii++ ) { + + // lowest num. missings? add to report + if ( numMs.at(ii) < numMissing ) { + numMissing = numMs.at(ii); + idxs.clear(); + idxs.push_back(ii); + + } else if ( numMs.at(ii) == numMissing && numMs.at(ii) < refMissing ) { + // has the same numMissing, less than refMissing + idxs.push_back(ii); + + } // else : ignore + + } + + // ignore if all kmers only increase missing kmers + if ( idxs.size() == 0 ) return ""; + + int idx = idxs.at(0); // path index + // report only one best + // select the longest alt if there are multiple bests + // + // 1) only one combination has the minimum num. of missings + if ( idxs.size() == 1 ) { + // return records as HOM + return getHomRecord(idx); + } + // 2) multiple combinations with equal num. of missings + else { + // get the path idx with longest seq length + uint32 seqLenMax = seqs.at(idx).size(); + for ( int ii = 1; ii < idxs.size(); ii++ ) { + uint32 seqLen = seqs.at(idxs.at(ii)).length(); + if ( seqLen > seqLenMax ) { + seqLenMax = seqLen; + idx = idxs.at(ii); + } + } + // return records as HOM + return getHomRecord(idx); + } +} + +string +varMer::strictPolish() { + uint32 numMissing = UINT32_MAX; // actual minimum number of missing kmers in the combination with minimum missings + vector idxs; // add reference path as default if ( numMs.size() == 0 ) return ""; @@ -244,6 +299,7 @@ varMer::betterVariant() { } // 2) multiple combinations with equal num. of missings else { + // TODO: Adjust this part // get the path idx with longest seq length uint32 seqLenMax = seqs.at(idx).size(); for ( int ii = 1; ii < idxs.size(); ii++ ) { @@ -258,6 +314,89 @@ varMer::betterVariant() { } } +string +varMer::loosePolish() { + + // fprintf(stderr, "[ DEBUG ] :: Loose mode called\n"); + + uint32 numMissing = UINT32_MAX; // actual minimum number of missing kmers in the combination with minimum missings + vector idxs; + + // add reference path as default + if ( numMs.size() == 0 ) return ""; + + uint32 refMissing = numMs.at(0); + numMissing = refMissing; + + // only add the variant if the numMs are less or equal to the ref path + for ( int ii = 0; ii < numMs.size(); ii++ ) { + + // lowest num. missings? add to report + if ( numMs.at(ii) < numMissing ) { + numMissing = numMs.at(ii); + idxs.clear(); + idxs.push_back(ii); + + } else if ( numMs.at(ii) == numMissing && numMs.at(ii) <= refMissing ) { + // has the same numMissing, less than refMissing + idxs.push_back(ii); + + } // else : ignore + + } + + // fprintf(stderr, "[ DEBUG ] :: num. combinations picked up: %lu at Mmin %u\n", idxs.size(), numMissing); + + // ignore if all kmers only increase missing kmers + if ( idxs.size() == 0 ) return ""; + + int idx = idxs.at(0); // path index + // report all, with warnings for multiple pathes + + // 1) only one combination has the minimum num. of missings + if ( idxs.size() == 1 ) { + // return records as HOM + return getHomRecord(idx); + } + // 2) multiple combinations with equal num. of missings + else { + + // only 2 pathes exist, first is the REF path + if (idxs.at(0) == 0 && idxs.size() == 2) + return getHomRecord(idxs.at(1)); + + // find the path with most ALT variants + int maxVars = 0; + int maxIdx = idx; + + // for each pathes + for ( int ii = 1; ii < idxs.size(); ii++) { + int count = 0; + idx = idxs.at(ii); + + // check if each gt is ALT + for ( uint64 i = 0; i < gtPaths.at(idx).size(); i++) { + int altIdx = gtPaths[idx][i]; + if (altIdx > 0) { + count++; + } + } + + if (count > maxVars) { + maxVars = count; + maxIdx = idx; + } + } + + fprintf(stderr, "[ WARNING ] :: Multiple (%lu) alternate pathes detected in a path beginning with variant : %s", idxs.size(), posGt->_gts[0]->_record->save().c_str()); + fprintf(stderr, "[ WARNING ] :: Max. %d ALT variants selected\n", maxVars); + return getHomRecord(maxIdx); + } +} + +/*** + * -polish mode. Use k* + */ string varMer::bestVariant() { diff --git a/src/merfin/varMer.H b/src/merfin/varMer.H index fe1dbb7..935a94a 100644 --- a/src/merfin/varMer.H +++ b/src/merfin/varMer.H @@ -41,7 +41,9 @@ public: public: string bestVariant(); - string betterVariant(); // Report variants only if it reduces num. missings; regardless of k* + string betterVariant(); // Report variants only if it reduces num. missings; regardless of k*; Taking longer pathes for ties + string strictPolish(); // Report variants only if it reduces num. missings; regardless of k*; Taking minimum edits + string loosePolish(); // Remove variants only if it increases num. missings; regardless of k*; Taking maximum edits vector bestFilter(); private: