Skip to content

Commit

Permalink
Rework createdb to correctly allow for only one directory or (new) ts…
Browse files Browse the repository at this point in the history
…v input, in addition to loose files
  • Loading branch information
milot-mirdita committed Mar 4, 2024
1 parent 0c3b7f2 commit e1394aa
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 13 deletions.
16 changes: 12 additions & 4 deletions src/FoldseekBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,19 @@ void (*validatorUpdate)(void) = updateValdiation;

std::vector<Command> foldseekCommands = {
{"createdb", structcreatedb, &localPar.structurecreatedb, COMMAND_MAIN,
"Convert PDB/mmCIF/tar[.gz]/DB files to a db",
"Convert PDB/mmCIF/tar[.gz]/DB files to a db",
"Convert PDB/mmCIF/tar[.gz]/DB files or directory/TSV to a structure DB",
"# Process multiple files\n"
"foldseek createdb examples/1tim.pdb.gz examples/8tim.pdb.gz DB\n"
"# Process a directory containing PDB|mmCIF[.gz]|tar[.gz]|DB recursively, only one directory can be given\n"
"foldseek createdb examples/ DB\n"
"# Process a TSV file with a list of PDB|mmCIF[.gz]|tar[.gz]|DB, only one TSV can be given\n"
"foldseek createdb examples.tsv DB\n"
"# Process a directory or tar file and filter based on file name\n"
"# Note: --file-include and --file-exclude only apply to directory or tar input\n"
"foldseek createdb examples/ --file-include \"pdb.gz$\"\n",
"Martin Steinegger <[email protected]>",
"<i:PDB|mmCIF[.gz]|tar|DB> ... <i:PDB|mmCIF[.gz]|tar|DB> <o:sequenceDB>",
CITATION_FOLDSEEK, {{"PDB|mmCIF[.gz]|stdin|tar|DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC,
"<i:directory|.tsv>|<i:PDB|mmCIF[.gz]|tar[.gz]|DB> ... <i:PDB|mmCIF[.gz]|tar|DB> <o:sequenceDB>",
CITATION_FOLDSEEK, {{"PDB|mmCIF[.gz]|stdin|tar[.gz]|DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC,
#ifdef HAVE_GCS
&DbValidator::flatfileStdinGenericUri
#else
Expand Down
47 changes: 38 additions & 9 deletions src/strucclustutils/structcreatedb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,10 +250,21 @@ int structcreatedb(int argc, const char **argv, const Command& command) {
std::string outputName = par.filenames.back();
par.filenames.pop_back();


PatternCompiler include(par.fileInclude.c_str());
PatternCompiler exclude(par.fileExclude.c_str());
if (par.filenames.size() == 1 && FileUtil::directoryExists(par.filenames.back().c_str())) {

for (size_t i = 1; i < par.filenames.size(); ++i) {
if (FileUtil::directoryExists(par.filenames[i].c_str()) || Util::endsWith(".tsv", par.filenames[i].c_str())) {
Debug(Debug::ERROR) << "Only one directory or tsv file (" << par.filenames[i] << ") or a list of files can be given\n";
EXIT(EXIT_FAILURE);
}
}

if (FileUtil::directoryExists(par.filenames[0].c_str())) {
if (par.filenames.size() > 1) {
Debug(Debug::ERROR) << "Only one directory can be given\n";
EXIT(EXIT_FAILURE);
}
std::vector<std::string> dirs;
dirs.push_back(par.filenames.back());
par.filenames.pop_back();
Expand All @@ -266,22 +277,40 @@ int structcreatedb(int argc, const char **argv, const Command& command) {
}
while (dirent* entry = readdir(handle)) {
std::string filename(entry->d_name);

if (filename != "." && filename !="..") {
if (filename != "." && filename != "..") {
std::string fullpath = dir + "/" + filename;
struct stat info;
stat(fullpath.c_str(), &info);
if (info.st_mode & S_IFDIR) {
if (info.st_mode & S_IFDIR) {
dirs.push_back(fullpath);
} else {
if (include.isMatch(filename.c_str()) == true && exclude.isMatch(filename.c_str()) == false) {
par.filenames.push_back(fullpath);
}
} else if (include.isMatch(filename.c_str()) == true && exclude.isMatch(filename.c_str()) == false) {
par.filenames.push_back(fullpath);
}
}
}
closedir(handle);
}
} else if (Util::endsWith(".tsv", par.filenames[0])) {
if (par.filenames.size() > 1) {
Debug(Debug::ERROR) << "Only one tsv file can be given\n";
EXIT(EXIT_FAILURE);
}
std::string tsv = par.filenames.back();
par.filenames.pop_back();

FILE* file = FileUtil::openFileOrDie(tsv.c_str(), "r", true);
char* line = NULL;
size_t len = 0;
ssize_t read;
while ((read = getline(&line, &len, file)) != -1) {
if (line[read - 1] == '\n') {
line[read - 1] = '\0';
read--;
}
par.filenames.push_back(line);
}
free(line);
fclose(file);
}

Debug(Debug::INFO) << "Output file: " << outputName << "\n";
Expand Down

0 comments on commit e1394aa

Please sign in to comment.