Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Woosub-Kim committed Apr 14, 2024
2 parents 14160db + b6dac8a commit 7192be5
Show file tree
Hide file tree
Showing 15 changed files with 81 additions and 48 deletions.
2 changes: 1 addition & 1 deletion data/structuresearch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ if [ "$ALIGNMENT_ALGO" = "tmalign" ]; then
if [ -n "${EXPAND}" ]; then
if notExists "${TMP_PATH}/strualn_expanded.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" mergeresultsbyset "${INTERMEDIATE}" "${TARGET_ALIGNMENT}_clu" "${TMP_PATH}/strualn_expanded" ${MERGERESULTBYSET_PAR} \
"$MMSEQS" mergeresultsbyset "${INTERMEDIATE}" "${TARGET_ALIGNMENT}_clu" "${TMP_PATH}/strualn_expanded" ${MERGERESULTBYSET_PAR} \
|| fail "Expand died"
fi
INTERMEDIATE="${TMP_PATH}/strualn_expanded"
Expand Down
2 changes: 1 addition & 1 deletion lib/mmseqs/src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ std::vector<Command> baseCommands = {
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"pvalDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
{"mergeresultsbyset", mergeresultsbyset, &par.threadsandcompression,COMMAND_MULTIHIT,
{"mergeresultsbyset", mergeresultsbyset, &par.mergeresultsbyset, COMMAND_MULTIHIT,
"Merge results from multiple ORFs back to their respective contig",
NULL,
"Ruoshi Zhang, Clovis Norroy & Milot Mirdita <[email protected]>",
Expand Down
5 changes: 5 additions & 0 deletions lib/mmseqs/src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,11 @@ Parameters::Parameters():
combinepvalbyset.push_back(&PARAM_COMPRESSED);
combinepvalbyset.push_back(&PARAM_V);
// mergeresultsbyset
mergeresultsbyset.push_back(&PARAM_PRELOAD_MODE);
mergeresultsbyset.push_back(&PARAM_THREADS);
mergeresultsbyset.push_back(&PARAM_COMPRESSED);
mergeresultsbyset.push_back(&PARAM_V);
// offsetalignment
offsetalignment.push_back(&PARAM_CHAIN_ALIGNMENT);
Expand Down
1 change: 1 addition & 0 deletions lib/mmseqs/src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -1166,6 +1166,7 @@ class Parameters {
std::vector<MMseqsParameter*> profile2seq;
std::vector<MMseqsParameter*> besthitbyset;
std::vector<MMseqsParameter*> combinepvalbyset;
std::vector<MMseqsParameter*> mergeresultsbyset;
std::vector<MMseqsParameter*> multihitdb;
std::vector<MMseqsParameter*> multihitsearch;
std::vector<MMseqsParameter*> expandaln;
Expand Down
2 changes: 2 additions & 0 deletions lib/mmseqs/src/commons/Sequence.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,8 @@ void Sequence::printProfile() const {
}
#ifdef GAP_POS_SCORING
printf("%3d %3d %3d\n", gDel[i] & 0xF, gDel[i] >> 4, gIns[i]);
else
printf("\n");
#endif
}
}
Expand Down
23 changes: 12 additions & 11 deletions lib/mmseqs/src/commons/SubstitutionMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrection(short *profileS
}

void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrectionAln(int8_t *profileScores,
int N, size_t alphabetSize, BaseMatrix *subMat) {
unsigned int N, size_t alphabetSize, BaseMatrix *subMat) {

const int windowSize = 40;

Expand All @@ -176,32 +176,33 @@ void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrectionAln(int8_t *prof

ProfileStates ps(alphabetSize,subMat->pBack);

for (int pos = 0; pos < N; pos++) {
for (unsigned int pos = 0; pos < N; pos++) {
for(size_t aa = 0; aa < alphabetSize; aa++) {
pnul[pos] += *(profileScores + pos + N*aa) * ps.prior[aa];
}
}

for (int i = 0; i < N; i++){
const int minPos = std::max(0, (i - windowSize/2));
const int maxPos = std::min(N, (i + windowSize/2));
for (unsigned int i = 0; i < N; i++){
const int minPos = std::max(0, ((int)i - windowSize/2));
const unsigned int maxPos = std::min(N, (i + windowSize/2));
const int windowLength = maxPos - minPos;
// negative score for the amino acids in the neighborhood of i
memset(aaSum, 0, sizeof(float) * alphabetSize);

for (int j = minPos; j < maxPos; j++){
if( i == j )
for (unsigned int j = minPos; j < maxPos; j++){
if (i == j) {
continue;
for(size_t aa = 0; aa < alphabetSize; aa++){
}
for (size_t aa = 0; aa < alphabetSize; aa++) {
aaSum[aa] += *(profileScores + aa*N + j) - pnul[j];
}
}
for(size_t aa = 0; aa < alphabetSize; aa++) {
for (size_t aa = 0; aa < alphabetSize; aa++) {
profileScores[i + aa*N] = static_cast<int8_t>(*(profileScores + i + N*aa) - aaSum[aa]/windowLength);
}
}
delete [] aaSum;
delete [] pnul;
delete[] aaSum;
delete[] pnul;
}


Expand Down
2 changes: 1 addition & 1 deletion lib/mmseqs/src/commons/SubstitutionMatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class SubstitutionMatrix : public BaseMatrix {
const int N,
size_t alphabetSize);
static void calcProfileProfileLocalAaBiasCorrectionAln(int8_t *profileScores,
int N,
unsigned int N,
size_t alphabetSize,
BaseMatrix *subMat);
static void calcGlobalAaBiasCorrection(const BaseMatrix * m,
Expand Down
2 changes: 1 addition & 1 deletion lib/mmseqs/src/prefiltering/Prefiltering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ Prefiltering::Prefiltering(const std::string &queryDB,


if (par.taxonList.length() > 0) {
taxonomyHook = new QueryMatcherTaxonomyHook(targetDB, tdbr, par.taxonList);
taxonomyHook = new QueryMatcherTaxonomyHook(targetDB, tdbr, par.taxonList, par.threads);
} else {
taxonomyHook = NULL;
}
Expand Down
27 changes: 21 additions & 6 deletions lib/mmseqs/src/prefiltering/QueryMatcherTaxonomyHook.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,47 @@
#include "DBReader.h"
#include "TaxonomyExpression.h"

#ifdef OPENMP
#include <omp.h>
#endif

class QueryMatcherTaxonomyHook : public QueryMatcherHook {
public:
QueryMatcherTaxonomyHook(std::string targetPath, DBReader<unsigned int>* targetReader, const std::string& expressionString)
: targetReader(targetReader), dbFrom(0) {
QueryMatcherTaxonomyHook(std::string targetPath, DBReader<unsigned int>* targetReader, const std::string& expressionString, unsigned int threads)
: targetReader(targetReader), dbFrom(0), threads(threads) {
std::string targetName = dbPathWithoutIndex(targetPath);
taxonomy = NcbiTaxonomy::openTaxonomy(targetName);
taxonomyMapping = new MappingReader(targetName);
expression = new TaxonomyExpression(expressionString, *taxonomy);
expression = new TaxonomyExpression*[threads];
for (unsigned int i = 0; i < threads; i++) {
expression[i] = new TaxonomyExpression(expressionString, *taxonomy);
}
}

~QueryMatcherTaxonomyHook() {
delete taxonomy;
delete taxonomyMapping;
delete expression;
for (unsigned int i = 0; i < threads; i++) {
delete expression[i];
}
delete[] expression;
}

void setDbFrom(unsigned int from) {
dbFrom = from;
}

size_t afterDiagonalMatchingHook(QueryMatcher& matcher, size_t resultSize) {
unsigned int thread_idx = 0;
#ifdef OPENMP
thread_idx = (unsigned int) omp_get_thread_num();
#endif
size_t writePos = 0;
for (size_t i = 0; i < resultSize; i++) {
unsigned int currId = matcher.foundDiagonals[i].id;
unsigned int key = targetReader->getDbKey(dbFrom + currId);
TaxID currTax = taxonomyMapping->lookup(key);
if (expression->isAncestor(currTax)) {
if (expression[thread_idx]->isAncestor(currTax)) {
if (i != writePos) {
matcher.foundDiagonals[writePos] = matcher.foundDiagonals[i];
}
Expand Down Expand Up @@ -63,9 +77,10 @@ class QueryMatcherTaxonomyHook : public QueryMatcherHook {
NcbiTaxonomy* taxonomy;
MappingReader* taxonomyMapping;
DBReader<unsigned int>* targetReader;
TaxonomyExpression* expression;
TaxonomyExpression** expression;

unsigned int dbFrom;
unsigned int threads;
};

#endif
6 changes: 3 additions & 3 deletions lib/mmseqs/src/prefiltering/ungappedprefilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ int prefilterInternal(int argc, const char **argv, const Command &command, int m


QueryMatcherTaxonomyHook * taxonomyHook = NULL;
if(par.PARAM_TAXON_LIST.wasSet){
taxonomyHook = new QueryMatcherTaxonomyHook(par.db2, tdbr, par.taxonList);
if (par.PARAM_TAXON_LIST.wasSet) {
taxonomyHook = new QueryMatcherTaxonomyHook(par.db2, tdbr, par.taxonList, par.threads);
}

Debug::Progress progress(qdbr->getSize());
Expand Down Expand Up @@ -121,7 +121,7 @@ int prefilterInternal(int argc, const char **argv, const Command &command, int m
unsigned int targetKey = tdbr->getDbKey(tId);
if(taxonomyHook != NULL){
TaxID currTax = taxonomyHook->taxonomyMapping->lookup(targetKey);
if (taxonomyHook->expression->isAncestor(currTax) == false) {
if (taxonomyHook->expression[thread_idx]->isAncestor(currTax) == false) {
continue;
}
}
Expand Down
37 changes: 23 additions & 14 deletions lib/mmseqs/src/test/TestDiagonalScoring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,11 +107,12 @@ int main (int, const char**) {
// std::cout << compositionBias[i] << " ";
// }
// std::cout << std::endl;
// matcher.createProfile(&s5, compositionBias);
// for(size_t i = 0; i < s6.L; i++){
// hits[0].id = s6.getId();
// hits[0].diagonal = 0;
// hits[0].count = 0;
// matcher.processQuery(&s5, compositionBias, hits, 1, 0);
// matcher.align(hits, 1, 0);
// std::cout << hits[0].diagonal << " " << (int)hits[0].count << std::endl;
// }

Expand All @@ -121,75 +122,83 @@ int main (int, const char**) {

hits[0].id = s1.getId();
hits[0].diagonal = 0;
matcher.processQuery(&s1, compositionBias, hits, 1);
matcher.createProfile(&s1, compositionBias);
matcher.align(hits, 1);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;

for(int i = 0; i < 16; i++){
hits[i].id = s1.getId();
hits[i].diagonal = 0;
}
matcher.processQuery(&s1, compositionBias, hits, 16);
matcher.align(hits, 16);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;


matcher.createProfile(&s2, compositionBias);
hits[0].id = s1.getId();
hits[0].diagonal = 9;
matcher.processQuery(&s2, compositionBias, hits, 1);
matcher.align(hits, 1);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;

for(int i = 0; i < 16; i++){
hits[i].id = s1.getId();
hits[i].diagonal = 9;
}
matcher.processQuery(&s2, compositionBias, hits, 16);
matcher.align(hits, 16);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;

for(int i = 0; i < 16; i++){
hits[i].id = s2.getId();
hits[i].diagonal = -9;
}
matcher.processQuery(&s1, compositionBias, hits, 16);
matcher.createProfile(&s1, compositionBias);
matcher.align(hits, 16);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;

matcher.processQuery(&s1, compositionBias, hits, 1);
matcher.align(hits, 1);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;

for(int i = 0; i < 16; i++){
hits[i].id = s2.getId();
hits[i].diagonal = -9;
}
matcher.processQuery(&s3, compositionBias, hits, 16);
matcher.createProfile(&s3, compositionBias);
matcher.align(hits, 16);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;

matcher.processQuery(&s3, compositionBias, hits, 1);
matcher.align(hits, 1);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;


hits[0].id = s4.getId();
hits[0].diagonal = -256;
matcher.processQuery(&s1, compositionBias, hits, 1);
matcher.createProfile(&s1, compositionBias);
matcher.align(hits, 1);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;


hits[0].id = s1.getId();
hits[0].diagonal = 256;
matcher.processQuery(&s4,compositionBias, hits, 1);
matcher.createProfile(&s4, compositionBias);
matcher.align(hits, 1);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;

hits[0].id = s7.getId();
hits[0].diagonal = -512;
matcher.processQuery(&s1,compositionBias, hits, 16);
matcher.createProfile(&s1, compositionBias);
matcher.align(hits, 16);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;

hits[0].id = s1.getId();
hits[0].diagonal = 512;
matcher.processQuery(&s7,compositionBias, hits, 16);
matcher.createProfile(&s7, compositionBias);
matcher.align(hits, 16);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;


hits[0].id = s7.getId();
hits[0].diagonal = 0;
matcher.processQuery(&s7, compositionBias, hits, 16);
matcher.align(hits, 16);
std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl;

delete [] compositionBias;
Expand Down
14 changes: 7 additions & 7 deletions lib/mmseqs/src/test/TestDiagonalScoringPerformance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,17 +92,17 @@ int main (int, const char**) {
SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias, 1.0);



matcher.processQuery(&s1,compositionBias, hits, 16);
matcher.createProfile(&s1, compositionBias);
matcher.align(hits, 16);
std::cout << (int)hits[0].count << " ";
std::cout << (int)hits[1].count << " ";
std::cout << (int)hits[2].count << " ";
std::cout << (int)hits[3].count << std::endl;

matcher.processQuery(&s1, compositionBias, hits, 1);
matcher.processQuery(&s1, compositionBias, hits + 1, 1);
matcher.processQuery(&s1, compositionBias, hits + 2, 1);
matcher.processQuery(&s1, compositionBias, hits + 3, 1);
matcher.align(hits, 1);
matcher.align(hits + 1, 1);
matcher.align(hits + 2, 1);
matcher.align(hits + 3, 1);

std::cout << (int)hits[0].count<< " ";
std::cout << (int)hits[1].count<< " ";
Expand All @@ -114,7 +114,7 @@ int main (int, const char**) {
hits[j].diagonal = rand()%s1.L;
}
// std::reverse(hits, hits+1000);
matcher.processQuery(&s1, compositionBias, hits, 16000);
matcher.align(hits, 16000);
}
// std::cout << ExtendedSubstitutionMatrix::calcScore(s1.sequence, s1.sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].diagonalScore << std::endl;
// std::cout << (int)hits[0].diagonalScore << std::endl;
Expand Down
2 changes: 1 addition & 1 deletion lib/mmseqs/src/util/expandaln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,8 +321,8 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl
continue;
}
} else {
size_t cSeqId = cReader->getId(cSeqKey);
if (returnAlnRes == false || par.expansionMode == Parameters::EXPAND_RESCORE_BACKTRACE) {
size_t cSeqId = cReader->getId(cSeqKey);
cSeq.mapSequence(cSeqId, cSeqKey, cReader->getData(cSeqId, thread_idx), cReader->getSeqLen(cSeqId));
}
//rescoreResultByBacktrace(resultAc, aSeq, cSeq, subMat, compositionBias, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid());
Expand Down
2 changes: 1 addition & 1 deletion src/strucclustutils/convert2pdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ int convert2pdb(int argc, const char **argv, const Command& command) {
aa = 'X';
}
const char* aa3 = threeLetterLookup[(int)(aa - 'A')];
fprintf(threadHandle, "ATOM % 5d CA %s %c% 4d% 12.3f% 8.3f% 8.3f\n", (int)(j + 1), aa3, chainName[0], int(j + 1), ca[j], ca[j + (1 * seqLen)], ca[j + (2 * seqLen)]);
fprintf(threadHandle, "ATOM %5d CA %s %c%4d %8.3f%8.3f%8.3f\n", (int)(j + 1), aa3, chainName[0], int(j + 1), ca[j], ca[j + (1 * seqLen)], ca[j + (2 * seqLen)]);
}
if (outputMode == LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL) {
fprintf(threadHandle, "ENDMDL\n");
Expand Down
2 changes: 1 addition & 1 deletion src/workflow/StructureSearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ int structuresearch(int argc, const char **argv, const Command &command) {
EXIT(EXIT_FAILURE);
}
}
cmd.addVariable("MERGERESULTBYSET_PAR", par.createParameterString(par.threadsandcompression).c_str());
cmd.addVariable("MERGERESULTBYSET_PAR", par.createParameterString(par.mergeresultsbyset).c_str());
cmd.addVariable("EXPAND", "1");
}
std::string program = tmpDir + "/structuresearch.sh";
Expand Down

0 comments on commit 7192be5

Please sign in to comment.