diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 63bc8cc4..f0fd6082 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -16,11 +16,19 @@ void (*validatorUpdate)(void) = updateValdiation; std::vector foldseekCommands = { {"createdb", structcreatedb, &localPar.structurecreatedb, COMMAND_MAIN, - "Convert PDB/mmCIF/tar[.gz]/DB files to a db", - "Convert PDB/mmCIF/tar[.gz]/DB files to a db", + "Convert PDB/mmCIF/tar[.gz]/DB files or directory/TSV to a structure DB", + "# Process multiple files\n" + "foldseek createdb examples/1tim.pdb.gz examples/8tim.pdb.gz DB\n" + "# Process a directory containing PDB|mmCIF[.gz]|tar[.gz]|DB recursively, only one directory can be given\n" + "foldseek createdb examples/ DB\n" + "# Process a TSV file with a list of PDB|mmCIF[.gz]|tar[.gz]|DB, only one TSV can be given\n" + "foldseek createdb examples.tsv DB\n" + "# Process a directory or tar file and filter based on file name\n" + "# Note: --file-include and --file-exclude only apply to directory or tar input\n" + "foldseek createdb examples/ --file-include \"pdb.gz$\"\n", "Martin Steinegger ", - " ... ", - CITATION_FOLDSEEK, {{"PDB|mmCIF[.gz]|stdin|tar|DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, + "| ... ", + CITATION_FOLDSEEK, {{"PDB|mmCIF[.gz]|stdin|tar[.gz]|DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, #ifdef HAVE_GCS &DbValidator::flatfileStdinGenericUri #else diff --git a/src/strucclustutils/structcreatedb.cpp b/src/strucclustutils/structcreatedb.cpp index 0abb9378..a198bff2 100644 --- a/src/strucclustutils/structcreatedb.cpp +++ b/src/strucclustutils/structcreatedb.cpp @@ -250,10 +250,21 @@ int structcreatedb(int argc, const char **argv, const Command& command) { std::string outputName = par.filenames.back(); par.filenames.pop_back(); - PatternCompiler include(par.fileInclude.c_str()); PatternCompiler exclude(par.fileExclude.c_str()); - if (par.filenames.size() == 1 && FileUtil::directoryExists(par.filenames.back().c_str())) { + + for (size_t i = 1; i < par.filenames.size(); ++i) { + if (FileUtil::directoryExists(par.filenames[i].c_str()) || Util::endsWith(".tsv", par.filenames[i].c_str())) { + Debug(Debug::ERROR) << "Only one directory or tsv file (" << par.filenames[i] << ") or a list of files can be given\n"; + EXIT(EXIT_FAILURE); + } + } + + if (FileUtil::directoryExists(par.filenames[0].c_str())) { + if (par.filenames.size() > 1) { + Debug(Debug::ERROR) << "Only one directory can be given\n"; + EXIT(EXIT_FAILURE); + } std::vector dirs; dirs.push_back(par.filenames.back()); par.filenames.pop_back(); @@ -266,22 +277,40 @@ int structcreatedb(int argc, const char **argv, const Command& command) { } while (dirent* entry = readdir(handle)) { std::string filename(entry->d_name); - - if (filename != "." && filename !="..") { + if (filename != "." && filename != "..") { std::string fullpath = dir + "/" + filename; struct stat info; stat(fullpath.c_str(), &info); - if (info.st_mode & S_IFDIR) { + if (info.st_mode & S_IFDIR) { dirs.push_back(fullpath); - } else { - if (include.isMatch(filename.c_str()) == true && exclude.isMatch(filename.c_str()) == false) { - par.filenames.push_back(fullpath); - } + } else if (include.isMatch(filename.c_str()) == true && exclude.isMatch(filename.c_str()) == false) { + par.filenames.push_back(fullpath); } } } closedir(handle); } + } else if (Util::endsWith(".tsv", par.filenames[0])) { + if (par.filenames.size() > 1) { + Debug(Debug::ERROR) << "Only one tsv file can be given\n"; + EXIT(EXIT_FAILURE); + } + std::string tsv = par.filenames.back(); + par.filenames.pop_back(); + + FILE* file = FileUtil::openFileOrDie(tsv.c_str(), "r", true); + char* line = NULL; + size_t len = 0; + ssize_t read; + while ((read = getline(&line, &len, file)) != -1) { + if (line[read - 1] == '\n') { + line[read - 1] = '\0'; + read--; + } + par.filenames.push_back(line); + } + free(line); + fclose(file); } Debug(Debug::INFO) << "Output file: " << outputName << "\n";