From 94a0d7aaf400e0cd630682b9be42532f6b367380 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Tue, 24 Jul 2018 07:52:10 -0400 Subject: [PATCH] added option to get output as matrix --- src/cgi/core_genome_identity.cpp | 4 +- src/cgi/include/computeCoreIdentity.hpp | 92 ++++++++++++++++++++++++- src/map/include/map_parameters.hpp | 1 + src/map/include/parseCmdArgs.hpp | 10 ++- 4 files changed, 104 insertions(+), 3 deletions(-) diff --git a/src/cgi/core_genome_identity.cpp b/src/cgi/core_genome_identity.cpp index a0d0087..1df8c1a 100644 --- a/src/cgi/core_genome_identity.cpp +++ b/src/cgi/core_genome_identity.cpp @@ -53,7 +53,6 @@ int main(int argc, char** argv) //Initialize the files to delete the existing content { - std::ofstream outstrm1(fileName); #ifdef DEBUG std::ofstream outstrm2(fileName + ".map.1way"); std::ofstream outstrm3(fileName + ".map.2way"); @@ -88,4 +87,7 @@ int main(int argc, char** argv) } cgi::outputCGI (parameters, finalResults, fileName); + + if (parameters.matrixOutput) + cgi::outputPhylip (parameters, finalResults, fileName); } diff --git a/src/cgi/include/computeCoreIdentity.hpp b/src/cgi/include/computeCoreIdentity.hpp index 382ccf9..87259e4 100644 --- a/src/cgi/include/computeCoreIdentity.hpp +++ b/src/cgi/include/computeCoreIdentity.hpp @@ -289,7 +289,7 @@ namespace cgi //sort result by identity std::sort(CGI_ResultsVector.rbegin(), CGI_ResultsVector.rend()); - std::ofstream outstrm(fileName, std::ios::app); + std::ofstream outstrm(fileName); //Report results for(auto &e : CGI_ResultsVector) @@ -307,6 +307,96 @@ namespace cgi outstrm.close(); } + + /** + * @brief output FastANI results as lower triangular matrix + * @param[in] parameters algorithm parameters + * @param[in] CGI_ResultsVector results + * @param[in] fileName file name where results will be reported + */ + void outputPhylip(skch::Parameters ¶meters, + std::vector &CGI_ResultsVector, + std::string &fileName) + { + std::unordered_map genome2Int; // name of genome -> integer + std::unordered_map genome2Int_rev; // integer -> name of genome + + //Assign unique index to the set of query and reference genomes + for(auto &e : parameters.querySequences) + { + auto id = genome2Int.size(); + if( genome2Int.find(e) == genome2Int.end() ) + { + genome2Int [e] = id; + genome2Int_rev [id] = e; + } + } + + for(auto &e : parameters.refSequences) + { + auto id = genome2Int.size(); + if( genome2Int.find(e) == genome2Int.end() ) + { + genome2Int [e] = id; + genome2Int_rev [id] = e; + } + } + + int totalGenomes = genome2Int.size(); + + //create a square 2-d matrix + std::vector< std::vector > fastANI_matrix (totalGenomes, std::vector (totalGenomes, 0.0)); + + //transform FastANI results into 3-tuples + for(auto &e : CGI_ResultsVector) + { + if(e.countSeq >= parameters.minFragments) + { + int qGenome = genome2Int [ parameters.querySequences[e.qryGenomeId] ]; + int rGenome = genome2Int [ parameters.refSequences[e.refGenomeId] ]; + + if (qGenome != rGenome) //ignore if both genomes are same + { + if (qGenome > rGenome) + { + if (fastANI_matrix[qGenome][rGenome] > 0) + fastANI_matrix[qGenome][rGenome] = (fastANI_matrix[qGenome][rGenome] + e.identity)/2; + else + fastANI_matrix[qGenome][rGenome] = e.identity; + } + else + { + if (fastANI_matrix[rGenome][qGenome] > 0) + fastANI_matrix[rGenome][qGenome] = (fastANI_matrix[rGenome][qGenome] + e.identity)/2; + else + fastANI_matrix[rGenome][qGenome] = e.identity; + } + } + } + } + + std::ofstream outstrm(fileName + ".matrix"); + + outstrm << totalGenomes << "\n"; + + //Report matrix + for (int i = 0; i < totalGenomes; i++) + { + //output genome name + outstrm << genome2Int_rev[i]; + + for (int j = 0; j < i; j++) + { + //output ani values + //average if computed twice + std::string val = fastANI_matrix[i][j] > 0.0 ? std::to_string (fastANI_matrix[i][j]) : "NA"; + outstrm << "\t" << val; + } + outstrm << "\n"; + } + + outstrm.close(); + } } #endif diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index e10ccd8..41dba83 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -33,6 +33,7 @@ namespace skch std::string outFileName; //output file name bool reportAll; //Report all alignments if this is true bool visualize; //Visualize the conserved regions of two genomes + bool matrixOutput; //report fastani results as lower triangular matrix }; } diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 88201ee..45792e0 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -49,7 +49,7 @@ Example usage: \n\ cmd.defineOption("queryList", "a file containing list of query genome files, one genome per line", ArgvParser::OptionRequiresValue); cmd.defineOptionAlternative("queryList","ql"); - cmd.defineOption("kmer", "kmer size <= 16 [default 16]", ArgvParser::OptionRequiresValue); + cmd.defineOption("kmer", "kmer size <= 16 [default : 16]", ArgvParser::OptionRequiresValue); cmd.defineOptionAlternative("kmer","k"); cmd.defineOption("fragLen", "fragment length [default : 3,000]", ArgvParser::OptionRequiresValue); @@ -58,6 +58,8 @@ Example usage: \n\ cmd.defineOption("visualize", "output mappings for visualization, can be enabled for single genome to single genome comparison only [disabled by default]"); + cmd.defineOption("matrix", "also output ANI values as lower triangular matrix (format inspired from phylip). If enabled, you should expect an output file with .matrix extension [disabled by default]"); + cmd.defineOption("output", "output file name", ArgvParser::OptionRequired | ArgvParser::OptionRequiresValue); cmd.defineOptionAlternative("output","o"); } @@ -238,6 +240,12 @@ Example usage: \n\ else parameters.visualize = false; + + if(cmd.foundOption("matrix")) + parameters.matrixOutput = true; + else + parameters.matrixOutput = false; + //Parse algorithm parameters if(cmd.foundOption("kmer")) {