From 2ea72d5e55dabaca2df7770997512a020f6ad35c Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 30 Mar 2020 16:36:04 +0200 Subject: [PATCH 1/2] this should resolve #88 --- src/algorithms/bin_path_info.cpp | 3 +++ src/algorithms/bin_path_info.hpp | 1 + src/subcommand/bin_main.cpp | 28 ++++++++++++++++++++++++++-- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/algorithms/bin_path_info.cpp b/src/algorithms/bin_path_info.cpp index 9d67f34d..c1723033 100644 --- a/src/algorithms/bin_path_info.cpp +++ b/src/algorithms/bin_path_info.cpp @@ -10,6 +10,7 @@ void bin_path_info(const PathHandleGraph& graph, const std::vector>&, const std::map&)>& handle_path, const std::function& handle_sequence, + const std::function& handle_fasta, uint64_t num_bins, uint64_t bin_width) { // the graph must be compacted for this to work @@ -35,6 +36,8 @@ void bin_path_info(const PathHandleGraph& graph, for (uint64_t i = 0; i < num_bins; ++i) { handle_sequence(i+1, graph_seq.substr(i*bin_width, bin_width)); } + // write out pangenome sequence if wished so + handle_fasta(graph_seq); graph_seq.clear(); // clean up std::unordered_map path_length; graph.for_each_path_handle([&](const path_handle_t& path) { diff --git a/src/algorithms/bin_path_info.hpp b/src/algorithms/bin_path_info.hpp index 08fd67bd..d0e6200d 100644 --- a/src/algorithms/bin_path_info.hpp +++ b/src/algorithms/bin_path_info.hpp @@ -34,6 +34,7 @@ namespace odgi { const std::vector> &, const std::map &)> &handle_path, const std::function &handle_sequence, + const std::function &handle_fasta, uint64_t num_bins = 0, uint64_t bin_width = 0); } diff --git a/src/subcommand/bin_main.cpp b/src/subcommand/bin_main.cpp index 55c58c4d..5c370e13 100644 --- a/src/subcommand/bin_main.cpp +++ b/src/subcommand/bin_main.cpp @@ -3,6 +3,8 @@ #include "args.hxx" #include "algorithms/bin_path_info.hpp" +#include + namespace odgi { using namespace odgi::subcommand; @@ -20,6 +22,7 @@ int main_bin(int argc, char** argv) { args::HelpFlag help(parser, "help", "display this help summary", {'h', "help"}); args::ValueFlag dg_out_file(parser, "FILE", "store the graph in this file", {'o', "out"}); args::ValueFlag dg_in_file(parser, "FILE", "load the graph from this file", {'i', "idx"}); + args::ValueFlag fa_out_file(parser, "FILE", "store the pangenome sequence in FASTA format in this file", {'f', "fasta"}); args::ValueFlag path_delim(parser, "path-delim", "annotate rows by prefix and suffix of this delimiter", {'D', "path-delim"}); args::Flag output_json(parser, "write-json", "write JSON format output including additional path positional information", {'j', "json"}); args::Flag aggregate_delim(parser, "aggregate-delim", "aggregate on path prefix delimiter", {'a', "aggregate-delim"}); @@ -105,6 +108,27 @@ int main_bin(int argc, char** argv) { } }; + std::function write_fasta + = [&](const std::string& nuc_seq) { + if (fa_out_file) { + std::ofstream out(args::get(fa_out_file)); + std::string fa_out_name = args::get(fa_out_file).c_str(); + std::regex regex("/"); + std::vector splitted( + std::sregex_token_iterator(fa_out_name.begin(), fa_out_name.end(), regex, -1), + std::sregex_token_iterator() + ); + fa_out_name = splitted[splitted.size() - 1]; + // Write header + out << ">" << fa_out_name << std::endl; + // Write the actual sequences, 80 nucleotides per line + for (unsigned i = 0; i < nuc_seq.length(); i += 80) { + std:: string sub_nuc_seq = nuc_seq.substr(i, 80); + out << sub_nuc_seq << std::endl; + } + } + }; + std::function>&, const std::map&)> write_json @@ -176,7 +200,7 @@ int main_bin(int argc, char** argv) { if (args::get(output_json)) { algorithms::bin_path_info(graph, (args::get(aggregate_delim) ? args::get(path_delim) : ""), - write_header_json,write_json, write_seq_json, + write_header_json,write_json, write_seq_json, write_fasta, args::get(num_bins), args::get(bin_width)); } else { std::cout << "path.name" << "\t" @@ -189,7 +213,7 @@ int main_bin(int argc, char** argv) { << "first.nucl" << "\t" << "last.nucl" << std::endl; algorithms::bin_path_info(graph, (args::get(aggregate_delim) ? args::get(path_delim) : ""), - write_header_tsv,write_tsv, write_seq_noop, + write_header_tsv,write_tsv, write_seq_noop, write_fasta, args::get(num_bins), args::get(bin_width)); } return 0; From 610165007052bd3defadf12b34ba9e36ddb5311e Mon Sep 17 00:00:00 2001 From: subwaystation Date: Tue, 31 Mar 2020 10:53:58 +0200 Subject: [PATCH 2/2] if -f,--fasta is specified no sequence will be written to the bin file --- src/subcommand/bin_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/bin_main.cpp b/src/subcommand/bin_main.cpp index 5c370e13..e4320746 100644 --- a/src/subcommand/bin_main.cpp +++ b/src/subcommand/bin_main.cpp @@ -100,7 +100,7 @@ int main_bin(int argc, char** argv) { std::function write_seq_json = [&](const uint64_t& bin_id, const std::string& seq) { - if (args::get(write_seqs_not)) { + if (args::get(write_seqs_not) || fa_out_file) { std::cout << "{\"bin_id\":" << bin_id << "}" << std::endl; } else { std::cout << "{\"bin_id\":" << bin_id << ","