Skip to content

Commit

Permalink
change -vmer to -filter and -polish, make -seqmer optional
Browse files Browse the repository at this point in the history
  • Loading branch information
arangrhie committed Jun 25, 2021
1 parent bc5fd39 commit 400b4df
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 109 deletions.
61 changes: 48 additions & 13 deletions src/merfin/merfin-globals.C
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

#include "merfin-globals.H"
#include "strings.H"

#include <libgen.h>

// Read probabilities lookup table for 1-4 copy kmers.
void
Expand All @@ -24,10 +24,10 @@ merfinGlobal::load_Kmetric(void) {
if (pLookupTable == nullptr) // No input supplied, no lookup table to make.
return;

fprintf(stderr, "-- Loading copy-number lookup table '%s'.\n\n", pLookupTable);
fprintf(stderr, "-- Loading probability table '%s'.\n\n", pLookupTable);

if (fileExists(pLookupTable) == false) {
fprintf(stderr, "ERROR: Lookup table (-lookup) file '%s' doesn't exist!\n", pLookupTable);
fprintf(stderr, "ERROR: Probability table (-prob) file '%s' doesn't exist!\n", pLookupTable);
exit(1);
}

Expand Down Expand Up @@ -119,20 +119,26 @@ merfinGlobal::load_Kmers(void) {
return;
}

merylFileReader* readDB = new merylFileReader(readDBname);
merylFileReader* asmDB = new merylFileReader(seqDBname);

double minMem, minMemTotal = 0;
double optMem, optMemTotal = 0;
bool useOpt = false;
bool useMin = false;

// Make readDB first so we know the k size
merylFileReader* readDB = new merylFileReader(readDBname);

// Open sequence and build seqDBname if not provided
load_Sequence();

fprintf(stderr, "-- Estimating required space for loading '%s'\n", readDBname);
readLookup = new merylExactLookup();
readLookup->estimateMemoryUsage(readDB, maxMemory, minMem, optMem, minV, maxV);
minMemTotal += minMem;
optMemTotal += optMem;

asmLookup = new merylExactLookup();
fprintf(stderr, "-- Estimating required space for loading '%s'\n", seqDBname);
merylFileReader* asmDB = new merylFileReader(seqDBname);
asmLookup = new merylExactLookup();
asmLookup->estimateMemoryUsage(asmDB, maxMemory, minMem, optMem, minV, maxV);
minMemTotal += minMem;
optMemTotal += optMem;
Expand Down Expand Up @@ -166,27 +172,56 @@ merfinGlobal::load_Kmers(void) {
delete asmDB;
}




void
merfinGlobal::open_Inputs(void) {
merfinGlobal::load_Sequence(void) {

if (reportType == OP_COMPL) {
return;
}

// Open input sequences.
if (seqDBname == nullptr) {
seqDBname = new char[FILENAME_MAX+1];
snprintf(seqDBname, FILENAME_MAX, "%s.meryl", basename(seqName));
fprintf(stderr, "-- No -seqmer given. Build sequence db as '%s'.\n", seqDBname);

// TODO: Replace this hacky counting when meryl has proper APIs
char merylcount[FILENAME_MAX+1];
char merylpath[FILENAME_MAX+1];

if (strcmp(execName, "merfin") == 0)
snprintf(merylpath, FILENAME_MAX, "meryl");
else
snprintf(merylpath, FILENAME_MAX, "%s/meryl", dirname(execName));

snprintf(merylcount, FILENAME_MAX, "%s count k=%d memory=%.3f %s output %s", merylpath, kmer::merSize(), maxMemory, seqName, seqDBname);

fprintf(stderr, "%s\n\n", merylcount);
system(merylcount);
fprintf(stderr, "\n");

}

// Open input sequence.
if (seqName != nullptr) {
fprintf(stderr, "-- Opening sequences in '%s'.\n", seqName);
seqFile = new dnaSeqFile(seqName);
}

}



void
merfinGlobal::open_Inputs(void) {

if (reportType == OP_COMPL) {
return;
}

// Open VCF input. This is only needed for reportType OP_VAR_MER.
// main() checks that we have a vcfName.

if (reportType == OP_VAR_MER) {
if (reportType == OP_FILTER || reportType == OP_POLISH) {
fprintf(stderr, "-- Opening vcf file '%s'.\n", vcfName);
inVcf = new vcfFile(vcfName);

Expand Down
19 changes: 11 additions & 8 deletions src/merfin/merfin-globals.H
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@

#define OP_NONE 0
#define OP_HIST 1
#define OP_DUMP 2
#define OP_VAR_MER 3
#define OP_COMPL 4

#define OP_COMPL 2
#define OP_DUMP 3
#define OP_FILTER 4
#define OP_POLISH 5


////////////////////////////////////////
Expand Down Expand Up @@ -129,6 +129,10 @@ public:
//
class merfinGlobal {
public:
merfinGlobal(char *const execname) {
execName = execname;
}

~merfinGlobal() {
delete dumpOutFile;
delete oVCF;
Expand All @@ -145,6 +149,7 @@ public:

public:
void load_Kmetric(void);
void load_Sequence(void);
void load_Kmers(void);
void open_Inputs(void);

Expand All @@ -171,9 +176,6 @@ public:
compressedFileWriter *oVCF = nullptr;





// Output data for histogram and data modes. Data mode only uses histKasm
// and histKmissing.
public:
Expand Down Expand Up @@ -234,8 +236,9 @@ public:
public:
double peak = 0;
bool nosplit = false;
bool bykstar = true;
uint32 comb = 15;

char *execName = nullptr;
};


Expand Down
6 changes: 3 additions & 3 deletions src/merfin/merfin-variants.C
Original file line number Diff line number Diff line change
Expand Up @@ -278,14 +278,14 @@ processVariants(void *G, void *T, void *S) {
// Generate output VCFs.

// Experimental: output vcf according to k*
if (g->bykstar) {
if (g->reportType == OP_POLISH) {
//fprintf(t->oVcf->file(), "%s", seqMer->bestVariant().c_str());
s->result += seqMer->bestVariant();
}

// Filter vcf and print as it was in the original vcf, conservatively
else {
vector<vcfRecord*> records = seqMer->bestVariantOriginalVCF();
vector<vcfRecord*> records = seqMer->bestFilter();

for (uint64 i = 0; i < records.size(); i++)
//records[i]->save(t->oVcf);
Expand Down Expand Up @@ -316,7 +316,7 @@ outputVariants(void *G, void *S) {
if (g->oVCF == nullptr) {
char name[FILENAME_MAX+1];

if (g->bykstar)
if (g->reportType == OP_POLISH)
snprintf(name, FILENAME_MAX, "%s.polish.vcf", g->outName);
else
snprintf(name, FILENAME_MAX, "%s.filter.vcf", g->outName);
Expand Down
Loading

0 comments on commit 400b4df

Please sign in to comment.