Skip to content

Commit

Permalink
Fix masking issue in Index when excluding k-mer
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed Jan 12, 2025
1 parent 934db4f commit 4766f92
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 93 deletions.
88 changes: 39 additions & 49 deletions src/prefiltering/IndexBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ class DbInfo {
};


void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedLookup,
SequenceLookup **unmaskedLookup,BaseMatrix &subMat,
void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup ** externalLookup, BaseMatrix &subMat,
ScoreMatrix & three, ScoreMatrix & two, Sequence *seq,
DBReader<unsigned int> *dbr, size_t dbFrom, size_t dbTo, int kmerThr,
bool mask, bool maskLowerCaseMode, float maskProb, int maskNrepeats, int targetSearchMode) {
Expand All @@ -65,27 +64,14 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
size_t dbSize = dbTo - dbFrom;
DbInfo* info = new DbInfo(dbFrom, dbTo, seq->getEffectiveKmerSize(), *dbr);

SequenceLookup *sequenceLookup;
if (unmaskedLookup != NULL && maskedLookup == NULL) {
*unmaskedLookup = new SequenceLookup(dbSize, info->aaDbSize);
sequenceLookup = *unmaskedLookup;
} else if (unmaskedLookup == NULL && maskedLookup != NULL) {
*maskedLookup = new SequenceLookup(dbSize, info->aaDbSize);
sequenceLookup = *maskedLookup;
} else if (unmaskedLookup != NULL && maskedLookup != NULL) {
*unmaskedLookup = new SequenceLookup(dbSize, info->aaDbSize);
*maskedLookup = new SequenceLookup(dbSize, info->aaDbSize);
sequenceLookup = *maskedLookup;
} else{
Debug(Debug::ERROR) << "This should not happen\n";
EXIT(EXIT_FAILURE);
}
*externalLookup = new SequenceLookup(dbSize, info->aaDbSize);
SequenceLookup *sequenceLookup = *externalLookup;


// identical scores for memory reduction code
char *idScoreLookup = getScoreLookup(subMat);
Debug::Progress progress(dbTo-dbFrom);

bool needMasking = (mask == 1 || maskNrepeats > 0 || maskLowerCaseMode == 1);
size_t maskedResidues = 0;
size_t totalKmerCount = 0;
#pragma omp parallel
Expand All @@ -96,16 +82,17 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
#endif
// need to prune low scoring k-mers through masking
Masker *masker = NULL;
if (maskedLookup != NULL) {
if (needMasking) {
masker = new Masker(subMat);
}


Indexer idxer(static_cast<unsigned int>(indexTable->getAlphabetSize()), seq->getKmerSize());
unsigned int alphabetSize = (indexTable != NULL) ? static_cast<unsigned int>(indexTable->getAlphabetSize())
: static_cast<unsigned int>(subMat.alphabetSize);
Indexer idxer(alphabetSize, seq->getKmerSize());
Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getUserSpacedKmerPattern());

KmerGenerator *generator = NULL;
if (isTargetSimiliarKmerSearch) {
if (isTargetSimiliarKmerSearch && indexTable != NULL) {
generator = new KmerGenerator(seq->getKmerSize(), indexTable->getAlphabetSize(), kmerThr);
if(isProfile){
generator->setDivideStrategy(s.profile_matrix);
Expand All @@ -132,26 +119,21 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
// count similar or exact k-mers based on sequence type
if (isTargetSimiliarKmerSearch) {
// Find out if we should also mask profiles
totalKmerCount += indexTable->addSimilarKmerCount(&s, generator);
unsigned char * seq = (isProfile) ? s.numConsensusSequence : s.numSequence;
if (unmaskedLookup != NULL) {
(*unmaskedLookup)->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
} else if (maskedLookup != NULL) {
(*maskedLookup)->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
if(indexTable != NULL){
totalKmerCount += indexTable->addSimilarKmerCount(&s, generator);
}
unsigned char * seq = (isProfile) ? s.numConsensusSequence : s.numSequence;

sequenceLookup->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);

} else {
// Do not mask if column state sequences are used
if (unmaskedLookup != NULL) {
(*unmaskedLookup)->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
}

maskedResidues += masker->maskSequence(s, mask, maskProb, maskLowerCaseMode, maskNrepeats);
sequenceLookup->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);

if(maskedLookup != NULL){
(*maskedLookup)->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
if(indexTable != NULL){
totalKmerCount += indexTable->addKmerCount(&s, &idxer, buffer, kmerThr, idScoreLookup);
}

totalKmerCount += indexTable->addKmerCount(&s, &idxer, buffer, kmerThr, idScoreLookup);
}
}

Expand All @@ -168,14 +150,13 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL


Debug(Debug::INFO) << "Index table: Masked residues: " << maskedResidues << "\n";
if(totalKmerCount == 0) {
Debug(Debug::ERROR) << "No k-mer could be extracted for the database " << dbr->getDataFileName() << ".\n"
if(indexTable != NULL && totalKmerCount == 0) {
Debug(Debug::WARNING) << "No k-mer could be extracted for the database " << dbr->getDataFileName() << ".\n"
<< "Maybe the sequences length is less than 14 residues.\n";
if (maskedResidues == true){
Debug(Debug::ERROR) << " or contains only low complexity regions.";
Debug(Debug::ERROR) << "Use --mask 0 to deactivate the low complexity filter.\n";
Debug(Debug::WARNING) << " or contains only low complexity regions.";
Debug(Debug::WARNING) << "Use --mask 0 to deactivate the low complexity filter.\n";
}
EXIT(EXIT_FAILURE);
}

dbr->remapData();
Expand All @@ -193,9 +174,10 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
// }
// Debug(Debug::INFO) << "Index table: Remove "<< lowSelectiveResidues <<" none selective residues\n";
// Debug(Debug::INFO) << "Index table: init... from "<< dbFrom << " to "<< dbTo << "\n";

indexTable->initMemory(info->tableSize);
indexTable->init();
if(indexTable != NULL){
indexTable->initMemory(info->tableSize);
indexTable->init();
}

delete info;
Debug::Progress progress2(dbTo-dbFrom);
Expand All @@ -208,7 +190,9 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
#endif
Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getUserSpacedKmerPattern());
Indexer idxer(static_cast<unsigned int>(indexTable->getAlphabetSize()), seq->getKmerSize());
unsigned int alphabetSize = (indexTable != NULL) ? static_cast<unsigned int>(indexTable->getAlphabetSize())
: static_cast<unsigned int>(subMat.alphabetSize);
Indexer idxer(alphabetSize, seq->getKmerSize());
IndexEntryLocalTmp *buffer = static_cast<IndexEntryLocalTmp *>(malloc( seq->getMaxLen() * sizeof(IndexEntryLocalTmp)));
size_t bufferSize = seq->getMaxLen();
KmerGenerator *generator = NULL;
Expand All @@ -229,10 +213,14 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
unsigned int qKey = dbr->getDbKey(id);
if (isTargetSimiliarKmerSearch) {
s.mapSequence(id - dbFrom, qKey, dbr->getData(id, thread_idx), dbr->getSeqLen(id));
indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer);
if(indexTable != NULL) {
indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer);
}
} else {
s.mapSequence(id - dbFrom, qKey, sequenceLookup->getSequence(id - dbFrom));
indexTable->addSequence(&s, &idxer, &buffer, bufferSize, kmerThr, idScoreLookup);
if(indexTable != NULL) {
indexTable->addSequence(&s, &idxer, &buffer, bufferSize, kmerThr, idScoreLookup);
}
}
}

Expand All @@ -245,6 +233,8 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
if(idScoreLookup!=NULL){
delete[] idScoreLookup;
}
indexTable->revertPointer();
indexTable->sortDBSeqLists();
if(indexTable != NULL){
indexTable->revertPointer();
indexTable->sortDBSeqLists();
}
}
3 changes: 1 addition & 2 deletions src/prefiltering/IndexBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@

class IndexBuilder {
public:
static void fillDatabase(IndexTable *indexTable, SequenceLookup **maskedLookup, SequenceLookup **unmaskedLookup,
BaseMatrix &subMat,
static void fillDatabase(IndexTable *indexTable, SequenceLookup **externalLookup, BaseMatrix &subMat,
ScoreMatrix & three, ScoreMatrix & two, Sequence *seq,
DBReader<unsigned int> *dbr, size_t dbFrom, size_t dbTo, int kmerThr,
bool mask, bool maskLowerCaseMode, float maskProb, int maskNrepeats, int targetSearchMode);
Expand Down
4 changes: 1 addition & 3 deletions src/prefiltering/Prefiltering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -531,11 +531,9 @@ void Prefiltering::getIndexTable(int split, size_t dbFrom, size_t dbSize) {
Parameters::isEqualDbtype(targetSeqType,Parameters::DBTYPE_AMINO_ACIDS))
? alphabetSize -1 : alphabetSize;
indexTable = new IndexTable(adjustAlphabetSize, kmerSize, false);
SequenceLookup **unmaskedLookup = maskMode == 0 && maskLowerCaseMode == 0 ? &sequenceLookup : NULL;
SequenceLookup **maskedLookup = maskMode == 1 || maskLowerCaseMode == 1 ? &sequenceLookup : NULL;

Debug(Debug::INFO) << "Index table k-mer threshold: " << localKmerThr << " at k-mer size " << kmerSize << " \n";
IndexBuilder::fillDatabase(indexTable, maskedLookup, unmaskedLookup, *kmerSubMat,
IndexBuilder::fillDatabase(indexTable, &sequenceLookup, *kmerSubMat,
_3merSubMatrix, _2merSubMatrix,
&tseq, tdbr, dbFrom, dbFrom + dbSize,
localKmerThr, maskMode, maskLowerCaseMode,
Expand Down
81 changes: 42 additions & 39 deletions src/prefiltering/PrefilteringIndexReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,14 +190,14 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
(Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_AMINO_ACIDS))
? alphabetSize -1: alphabetSize;

const bool noPrefilter = (indexSubset & Parameters::INDEX_SUBSET_NO_PREFILTER) != 0;
if (noPrefilter) {
splits = 0;
const bool noKmerIndex = (indexSubset & Parameters::INDEX_SUBSET_NO_PREFILTER) != 0;
if (noKmerIndex) {
splits = 1;
}

ScoreMatrix s3;
ScoreMatrix s2;
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && noPrefilter == false) {
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && noKmerIndex == false) {
int alphabetSize = subMat->alphabetSize;
subMat->alphabetSize = subMat->alphabetSize-1;
s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3);
Expand Down Expand Up @@ -225,36 +225,53 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
continue;
}

IndexTable indexTable(adjustAlphabetSize, kmerSize, false);
IndexTable * indexTable;
if(noKmerIndex){
indexTable = NULL;
} else {
indexTable = new IndexTable(adjustAlphabetSize, kmerSize, false);
}
SequenceLookup *sequenceLookup = NULL;
IndexBuilder::fillDatabase(&indexTable,
(maskMode == 1 || maskNrepeats > 0 || maskLowerCase == 1) ? &sequenceLookup : NULL,
(maskMode == 0 && maskNrepeats == 0 && maskLowerCase == 0) ? &sequenceLookup : NULL,
IndexBuilder::fillDatabase(indexTable, &sequenceLookup,
*subMat, s3, s2, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr,
maskMode, maskLowerCase, maskProb, maskNrepeats, targetSearchMode);
indexTable.printStatistics(subMat->num2aa);

if (sequenceLookup == NULL) {
Debug(Debug::ERROR) << "Invalid mask mode. No sequence lookup created!\n";
EXIT(EXIT_FAILURE);
}

// save the entries
unsigned int keyOffset = 1000 * s;
Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n";
char *entries = (char *) indexTable.getEntries();
size_t entriesSize = indexTable.getTableEntriesNum() * indexTable.getSizeOfEntry();
writer.writeData(entries, entriesSize, (keyOffset + ENTRIES), SPLIT_INDX + s);
writer.alignToPageSize(SPLIT_INDX + s);

// save the size
Debug(Debug::INFO) << "Write ENTRIESOFFSETS (" << (keyOffset + ENTRIESOFFSETS) << ")\n";
char *offsets = (char*)indexTable.getOffsets();
size_t offsetsSize = (indexTable.getTableSize() + 1) * sizeof(size_t);
writer.writeData(offsets, offsetsSize, (keyOffset + ENTRIESOFFSETS), SPLIT_INDX + s);
writer.alignToPageSize(SPLIT_INDX + s);
indexTable.deleteEntries();

if(noKmerIndex == false){
indexTable->printStatistics(subMat->num2aa);
// save the entries
Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n";
char *entries = (char *) indexTable->getEntries();
size_t entriesSize = indexTable->getTableEntriesNum() * indexTable->getSizeOfEntry();
writer.writeData(entries, entriesSize, (keyOffset + ENTRIES), SPLIT_INDX + s);
writer.alignToPageSize(SPLIT_INDX + s);

// save the size
Debug(Debug::INFO) << "Write ENTRIESOFFSETS (" << (keyOffset + ENTRIESOFFSETS) << ")\n";
char *offsets = (char *) indexTable->getOffsets();
size_t offsetsSize = (indexTable->getTableSize() + 1) * sizeof(size_t);
writer.writeData(offsets, offsetsSize, (keyOffset + ENTRIESOFFSETS), SPLIT_INDX + s);
writer.alignToPageSize(SPLIT_INDX + s);
indexTable->deleteEntries();

// ENTRIESNUM
Debug(Debug::INFO) << "Write ENTRIESNUM (" << (keyOffset + ENTRIESNUM) << ")\n";
uint64_t entriesNum = indexTable->getTableEntriesNum();
char *entriesNumPtr = (char *) &entriesNum;
writer.writeData(entriesNumPtr, 1 * sizeof(uint64_t), (keyOffset + ENTRIESNUM), SPLIT_INDX + s);
writer.alignToPageSize(SPLIT_INDX + s);

// SEQCOUNT
Debug(Debug::INFO) << "Write SEQCOUNT (" << (keyOffset + SEQCOUNT) << ")\n";
size_t tablesize = indexTable->getSize();
char *tablesizePtr = (char *) &tablesize;
writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s);
writer.alignToPageSize(SPLIT_INDX + s);
}
Debug(Debug::INFO) << "Write SEQINDEXDATASIZE (" << (keyOffset + SEQINDEXDATASIZE) << ")\n";
int64_t seqindexDataSize = sequenceLookup->getDataSize();
char *seqindexDataSizePtr = (char *) &seqindexDataSize;
Expand All @@ -271,20 +288,6 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
writer.writeData(sequenceLookup->getData(), (sequenceLookup->getDataSize() + 1) * sizeof(char), (keyOffset + SEQINDEXDATA), SPLIT_INDX + s);
writer.alignToPageSize(SPLIT_INDX + s);
delete sequenceLookup;

// ENTRIESNUM
Debug(Debug::INFO) << "Write ENTRIESNUM (" << (keyOffset + ENTRIESNUM) << ")\n";
uint64_t entriesNum = indexTable.getTableEntriesNum();
char *entriesNumPtr = (char *) &entriesNum;
writer.writeData(entriesNumPtr, 1 * sizeof(uint64_t), (keyOffset + ENTRIESNUM), SPLIT_INDX + s);
writer.alignToPageSize(SPLIT_INDX + s);

// SEQCOUNT
Debug(Debug::INFO) << "Write SEQCOUNT (" << (keyOffset + SEQCOUNT) << ")\n";
size_t tablesize = indexTable.getSize();
char *tablesizePtr = (char *) &tablesize;
writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s);
writer.alignToPageSize(SPLIT_INDX + s);
}

if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && indexSubset != Parameters::INDEX_SUBSET_NO_PREFILTER) {
Expand Down

0 comments on commit 4766f92

Please sign in to comment.