Skip to content

Commit

Permalink
convert2pdb can now write separate PDB files per chain or complex
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Mar 12, 2024
1 parent 9a8b088 commit 346c1dd
Show file tree
Hide file tree
Showing 4 changed files with 211 additions and 45 deletions.
6 changes: 3 additions & 3 deletions src/FoldseekBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,11 +247,11 @@ std::vector<Command> foldseekCommands = {
"<i:DB> <o:caDB>",
CITATION_FOLDSEEK, {{"Db", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &FoldSeekDbValidator::sequenceDb },
{"caDb", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::cadb }}},
{"convert2pdb", convert2pdb, &localPar.onlyverbosity, COMMAND_FORMAT_CONVERSION,
"Convert a foldseek structure db to a multi model PDB file",
{"convert2pdb", convert2pdb, &localPar.convert2pdb, COMMAND_FORMAT_CONVERSION,
"Convert a foldseek structure db to a single multi model PDB file or a directory of PDB files",
NULL,
"Milot Mirdita <[email protected]>",
"<i:Db> <o:pdbFile>",
"<i:Db> <o:pdbFile|pdbDir>",
CITATION_FOLDSEEK, {{"Db", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
{"pdbFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
{"scorecomplex", scorecomplex, &localPar.scorecomplex, COMMAND_ALIGNMENT,
Expand Down
8 changes: 7 additions & 1 deletion src/commons/LocalParameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ LocalParameters::LocalParameters() :
PARAM_INDEX_EXCLUDE(PARAM_INDEX_EXCLUDE_ID, "--index-exclude", "Index Exclusion", "Exclude parts of the index:\n0: Full index\n1: Exclude k-mer index (for use with --prefilter-mode 1)\n2: Exclude C-alpha coordinates (for use with --sort-by-structure-bits 0)\nFlags can be combined bit wise", typeid(int), (void *) &indexExclude, "^[0-3]{1}$", MMseqsParameter::COMMAND_EXPERT),
PARAM_COMPLEX_REPORT_MODE(PARAM_COMPLEX_REPORT_MODE_ID, "--complex-report-mode", "Complex report mode", "Complex report mode:\n0: No report\n1: Write complex report", typeid(int), (void *) &complexReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
PARAM_EXPAND_COMPLEX_EVALUE(PARAM_EXPAND_COMPLEX_EVALUE_ID, "--expand-complex-evalue", "E-value threshold for expandcomplex", "E-value threshold for expandcomplex (range 0.0-inf)", typeid(double), (void *) &eValueThrExpandComplex, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN),
PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$")
PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"),
PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC)
{
PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id";
PARAM_ALIGNMENT_MODE.regex = "^[0-3]{1}$";
Expand Down Expand Up @@ -196,6 +197,11 @@ LocalParameters::LocalParameters() :
expandcomplex.push_back(&PARAM_THREADS);
expandcomplex.push_back(&PARAM_V);

// convert2pdb
convert2pdb.push_back(&PARAM_PDB_OUTPUT_MODE);
convert2pdb.push_back(&PARAM_THREADS);
convert2pdb.push_back(&PARAM_V);

prefMode = PREF_MODE_KMER;
alignmentType = ALIGNMENT_TYPE_3DI_AA;
tmScoreThr = 0.0;
Expand Down
8 changes: 8 additions & 0 deletions src/commons/LocalParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ class LocalParameters : public Parameters {
static const int INDEX_EXCLUDE_KMER_INDEX = 1 << 0;
static const int INDEX_EXCLUDE_CA = 1 << 1;

// convert2pdb
static const int PDB_OUTPUT_MODE_MULTIMODEL = 0;
static const int PDB_OUTPUT_MODE_SINGLECHAIN = 1;
static const int PDB_OUTPUT_MODE_COMPLEX = 2;

// TODO
static const unsigned int FORMAT_ALIGNMENT_PDB_SUPERPOSED = 5;
std::vector<MMseqsParameter *> strucclust;
Expand All @@ -93,6 +98,7 @@ class LocalParameters : public Parameters {
std::vector<MMseqsParameter *> easyscomplexsearchworkflow;
std::vector<MMseqsParameter *> createcomplexreport;
std::vector<MMseqsParameter *> expandcomplex;
std::vector<MMseqsParameter *> convert2pdb;

PARAMETER(PARAM_PREF_MODE)
PARAMETER(PARAM_TMSCORE_THRESHOLD)
Expand All @@ -115,6 +121,7 @@ class LocalParameters : public Parameters {
PARAMETER(PARAM_COMPLEX_REPORT_MODE)
PARAMETER(PARAM_EXPAND_COMPLEX_EVALUE)
PARAMETER(PARAM_INPUT_FORMAT)
PARAMETER(PARAM_PDB_OUTPUT_MODE)

int prefMode;
float tmScoreThr;
Expand All @@ -137,6 +144,7 @@ class LocalParameters : public Parameters {
int complexReportMode;
double eValueThrExpandComplex;
int inputFormat;
int pdbOutputMode;

static std::vector<int> getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders,
bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy, bool &needQCa, bool &needTCa, bool &needTMaligner,
Expand Down
234 changes: 193 additions & 41 deletions src/strucclustutils/convert2pdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,69 +7,221 @@
#include "Util.h"
#include "FileUtil.h"
#include "Coordinate16.h"
#include "createcomplexreport.h"

#ifdef OPENMP
#include <omp.h>
#endif

class KeyIterator {
public:
virtual ~KeyIterator() {}
virtual size_t getSize() const = 0;
virtual std::pair<const unsigned int*, size_t> getDbKeys(size_t index) = 0;
};

class DbKeyIterator : public KeyIterator {
public:
DbKeyIterator(DBReader<unsigned int>& db) : db(db) {}
~DbKeyIterator() {}

size_t getSize() const override {
return db.getSize();
}

std::pair<const unsigned int*, size_t> getDbKeys(size_t index) override {
key = db.getDbKey(index);
return std::make_pair(&key, 1ull);
}

private:
DBReader<unsigned int>& db;
unsigned int key;
};

class MapIterator : public KeyIterator {
public:
MapIterator(const complexIdToChainKeys_t& map, const std::vector<unsigned int>& complexIndices)
: map(map), complexIndices(complexIndices) {}
~MapIterator() {}

size_t getSize() const override {
return complexIndices.size();
}

std::pair<const unsigned int*, size_t> getDbKeys(size_t index) override {
unsigned int key = complexIndices[index];
const std::vector<unsigned int>& currentKeys = map.at(key);
return std::make_pair(currentKeys.data(), currentKeys.size());
}

private:
const complexIdToChainKeys_t& map;
const std::vector<unsigned int>& complexIndices;
};

void writeTitle(FILE* handle, const char* headerData, size_t headerLen) {
int remainingHeader = headerLen;
fprintf(handle, "TITLE %.*s\n", std::min(70, (int)remainingHeader), headerData);
remainingHeader -= 70;
int continuation = 2;
while (remainingHeader > 0) {
fprintf(handle, "TITLE % 3d%.*s\n", continuation, std::min(70, (int)remainingHeader), headerData + (headerLen - remainingHeader));
remainingHeader -= 70;
continuation++;
}
}

const char* threeLetterLookup[26] = { "ALA", "ASX", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "XLE", "LYS", "LEU", "MET", "ASN", "PYL", "PRO", "GLN", "ARG", "SER", "THR", "SEC", "VAL", "TRP", "XAA", "TYR", "GLX" };
int convert2pdb(int argc, const char **argv, const Command& command) {
Parameters& par = Parameters::getInstance();
LocalParameters& par = LocalParameters::getLocalInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

DBReader<unsigned int> db(par.db1.c_str(), par.db1Index.c_str(), 1, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
int outputMode = par.pdbOutputMode;
int localThreads = par.threads;
if (outputMode == LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL) {
localThreads = 1;
} else {
localThreads = par.threads;
}

int mode = DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX;
if (outputMode != LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL) {
mode |= DBReader<unsigned int>::USE_LOOKUP;
}
DBReader<unsigned int> db(par.db1.c_str(), par.db1Index.c_str(), localThreads, mode);
db.open(DBReader<unsigned int>::NOSORT);

DBReader<unsigned int> db_header(par.hdr1.c_str(), par.hdr1Index.c_str(), 1, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
DBReader<unsigned int> db_header(par.hdr1.c_str(), par.hdr1Index.c_str(), localThreads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
db_header.open(DBReader<unsigned int>::NOSORT);

std::string dbCa = par.db1 + "_ca";
std::string dbCaIndex = par.db1 + "_ca.index";
DBReader<unsigned int> db_ca(dbCa.c_str(), dbCaIndex.c_str(), 1, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
DBReader<unsigned int> db_ca(dbCa.c_str(), dbCaIndex.c_str(), localThreads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
db_ca.open(DBReader<unsigned int>::NOSORT);

FILE* handle = fopen(par.db2.c_str(), "w");
if(handle == NULL) {
perror(par.db2.c_str());
EXIT(EXIT_FAILURE);
std::string lookupFile = par.db1 + ".lookup";
chainKeyToComplexId_t chainKeyToComplexIdMap;
complexIdToChainKeys_t complexIdToChainKeysMap;
std::vector<unsigned int> complexIndices;
if (outputMode == LocalParameters::PDB_OUTPUT_MODE_COMPLEX) {
getKeyToIdMapIdToKeysMapIdVec(lookupFile, chainKeyToComplexIdMap, complexIdToChainKeysMap, complexIndices);
}

Coordinate16 coords;
Debug(Debug::INFO) << "Start writing file to " << par.db2 << "\n";
for(size_t i = 0; i < db.getSize(); i++){
unsigned int key = db.getDbKey(i);
unsigned int headerId = db_header.getId(key);
const char* headerData = db_header.getData(headerId, 0);
const size_t headerLen = db_header.getEntryLen(headerId) - 2;

unsigned int seqId = db.getId(key);
const char* seqData = db.getData(seqId, 0);
const size_t seqLen = std::max(db.getEntryLen(seqId), (size_t)2) - 2;
FILE* handle;
if (outputMode == LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL) {
handle = fopen(par.db2.c_str(), "w");
if (handle == NULL) {
perror(par.db2.c_str());
EXIT(EXIT_FAILURE);
}
} else {
if (mkdir(par.db2.c_str(), 0755) != 0) {
if (errno != EEXIST) {
perror(par.db2.c_str());
EXIT(EXIT_FAILURE);
}
}
}

unsigned int caId = db_ca.getId(key);
const char* caData = db_ca.getData(caId, 0);
const size_t caLen = db_ca.getEntryLen(caId);
const char* threeLetterLookup[26] = { "ALA", "ASX", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "XLE", "LYS", "LEU", "MET", "ASN", "PYL", "PRO", "GLN", "ARG", "SER", "THR", "SEC", "VAL", "TRP", "XAA", "TYR", "GLX" };

float* ca = coords.read(caData, seqLen, caLen);
#pragma omp parallel num_threads(localThreads)
{
int thread_idx = 0;
#ifdef OPENMP
thread_idx = omp_get_thread_num();
#endif
FILE* threadHandle = NULL;
if (outputMode == LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL) {
threadHandle = handle;
}
Coordinate16 coords;

fprintf(handle, "MODEL % 8d\n", key);
int remainingHeader = headerLen;
fprintf(handle, "TITLE %.*s\n", std::min(70, (int)remainingHeader), headerData);
remainingHeader -= 70;
int continuation = 2;
while (remainingHeader > 0) {
fprintf(handle, "TITLE % 3d%.*s\n", continuation, std::min(70, (int)remainingHeader), headerData + (headerLen - remainingHeader));
remainingHeader -= 70;
continuation++;
KeyIterator* keyIterator;
if (outputMode == LocalParameters::PDB_OUTPUT_MODE_COMPLEX) {
keyIterator = new MapIterator(complexIdToChainKeysMap, complexIndices);
} else {
keyIterator = new DbKeyIterator(db);
}
for (size_t j = 0; j < seqLen; ++j) {
// make AA upper case
char aa = seqData[j] & ~0x20;
if (aa == '\0') {
aa = 'X';
const size_t size = keyIterator->getSize();
#pragma omp for schedule(dynamic, 1)
for (size_t i = 0; i < size; ++i) {
std::pair<const unsigned int*, size_t> keys = keyIterator->getDbKeys(i);
if (outputMode != LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL) {
unsigned int key = keys.first[0];
std::string name = db.getLookupEntryName(key);
if (outputMode == LocalParameters::PDB_OUTPUT_MODE_COMPLEX) {
std::string chain = name.substr(name.find_last_of('_') + 1);
if (chain.size() == 0) {
// keep the last underscore, if there is no chain
name = name.substr(0, name.find_last_of('_') + 1);
} else {
name = name.substr(0, name.find_last_of('_'));
}
}
std::string filename = par.db2 + "/" + name + ".pdb";
threadHandle = fopen(filename.c_str(), "w");
if (threadHandle == NULL) {
perror(filename.c_str());
EXIT(EXIT_FAILURE);
}

unsigned int headerId = db_header.getId(key);
const char* headerData = db_header.getData(headerId, thread_idx);
const size_t headerLen = db_header.getEntryLen(headerId) - 2;
writeTitle(threadHandle, headerData, headerLen);
}

std::string chainName = "A";
for (size_t j = 0; j < keys.second; ++j) {
unsigned int key = keys.first[j];

unsigned int seqId = db.getId(key);
const char* seqData = db.getData(seqId, thread_idx);
const size_t seqLen = std::max(db.getEntryLen(seqId), (size_t)2) - 2;

unsigned int caId = db_ca.getId(key);
const char* caData = db_ca.getData(caId, thread_idx);
const size_t caLen = db_ca.getEntryLen(caId);

float* ca = coords.read(caData, seqLen, caLen);

if (outputMode == LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL) {
fprintf(threadHandle, "MODEL % 8d\n", key);

unsigned int headerId = db_header.getId(key);
const char* headerData = db_header.getData(headerId, thread_idx);
const size_t headerLen = db_header.getEntryLen(headerId) - 2;
writeTitle(threadHandle, headerData, headerLen);
} else {
std::string name = db.getLookupEntryName(key);
chainName = name.substr(name.find_last_of('_') + 1);
if (chainName.size() == 0) {
chainName = "A";
}
}
for (size_t j = 0; j < seqLen; ++j) {
// make AA upper case
char aa = seqData[j] & ~0x20;
if (aa == '\0') {
aa = 'X';
}
const char* aa3 = threeLetterLookup[(int)(aa - 'A')];
fprintf(threadHandle, "ATOM % 5d CA %s %c% 4d% 12.3f% 8.3f% 8.3f\n", (int)(j + 1), aa3, chainName[0], int(j + 1), ca[j], ca[j + (1 * seqLen)], ca[j + (2 * seqLen)]);
}
if (outputMode == LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL) {
fprintf(threadHandle, "ENDMDL\n");
}
}
if (outputMode != LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL) {
fclose(threadHandle);
}
const char* aa3 = threeLetterLookup[(int)(aa - 'A')];
fprintf(handle, "ATOM % 5d CA %s A% 4d% 12.3f% 8.3f% 8.3f\n", (int)(j + 1), aa3, int(j + 1), ca[j], ca[j + (1 * seqLen)], ca[j + (2 * seqLen)]);
}
fprintf(handle, "ENDMDL\n");
delete keyIterator;
}
if (fclose(handle) != 0) {

if (outputMode == LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL && fclose(handle) != 0) {
Debug(Debug::ERROR) << "Cannot close file " << par.db2 << "\n";
EXIT(EXIT_FAILURE);
}
Expand Down

0 comments on commit 346c1dd

Please sign in to comment.