From d495ce0dbb7871ee2c76a88ff504364bbb0704d5 Mon Sep 17 00:00:00 2001 From: Artem Tarasov Date: Wed, 8 Apr 2020 19:26:32 +0200 Subject: [PATCH] command-line option for removing gap links from the output --- src/algorithms/bin_path_info.cpp | 50 ++++++++++++++++++++++++++++---- src/algorithms/bin_path_info.hpp | 3 +- src/subcommand/bin_main.cpp | 15 +++++----- 3 files changed, 54 insertions(+), 14 deletions(-) diff --git a/src/algorithms/bin_path_info.cpp b/src/algorithms/bin_path_info.cpp index c1723033..e73c0c23 100644 --- a/src/algorithms/bin_path_info.cpp +++ b/src/algorithms/bin_path_info.cpp @@ -12,7 +12,8 @@ void bin_path_info(const PathHandleGraph& graph, const std::function& handle_sequence, const std::function& handle_fasta, uint64_t num_bins, - uint64_t bin_width) { + uint64_t bin_width, + bool drop_gap_links) { // the graph must be compacted for this to work std::vector position_map(graph.get_node_count()+1); uint64_t len = 0; @@ -40,6 +41,7 @@ void bin_path_info(const PathHandleGraph& graph, handle_fasta(graph_seq); graph_seq.clear(); // clean up std::unordered_map path_length; + uint64_t gap_links_removed = 0; graph.for_each_path_handle([&](const path_handle_t& path) { std::vector> links; std::map bins; @@ -69,10 +71,10 @@ void bin_path_info(const PathHandleGraph& graph, } bins[curr_bin].mean_pos += path_pos++; nucleotide_count += 1; - if(bins[curr_bin].first_nucleotide == 0){ - bins[curr_bin].first_nucleotide = nucleotide_count; - } - bins[curr_bin].last_nucleotide = nucleotide_count; + if(bins[curr_bin].first_nucleotide == 0){ + bins[curr_bin].first_nucleotide = nucleotide_count; + } + bins[curr_bin].last_nucleotide = nucleotide_count; last_bin = curr_bin; last_is_rev = is_rev; last_pos_in_bin = curr_pos_in_bin; @@ -80,15 +82,51 @@ void bin_path_info(const PathHandleGraph& graph, }); links.push_back(std::make_pair(last_bin,0)); uint64_t path_length = path_pos; - uint64_t end_nucleotide = nucleotide_count; + uint64_t end_nucleotide = nucleotide_count; for (auto& entry : bins) { auto& v = entry.second; v.mean_inv /= (v.mean_cov ? v.mean_cov : 1); v.mean_cov /= bin_width; v.mean_pos /= bin_width * path_length * v.mean_cov; } + + if (drop_gap_links) { + std::vector bin_ids; + for (const auto &entry: bins) { + bin_ids.push_back(entry.first); + } + std::sort(bin_ids.begin(), bin_ids.end()); + + uint64_t fill_pos = 0; + + for (uint64_t i = 0; i < links.size(); ++i) { + auto link = links[i]; + + if (link.first == 0 || link.second == 0) + continue; + + if (link.first > link.second) { + links[fill_pos++] = link; + continue; + } + + auto left_it = std::lower_bound(bin_ids.begin(), bin_ids.end(), link.first + 1); + auto right_it = std::lower_bound(bin_ids.begin(), bin_ids.end(), link.second); + if (right_it > left_it) { + links[fill_pos++] = link; + } + } + + gap_links_removed += links.size() - fill_pos; + links.resize(fill_pos); + } + handle_path(graph.get_path_name(path), links, bins); }); + + if (drop_gap_links) { + std::cerr << "Gap links removed: " << gap_links_removed << std::endl; + } } } diff --git a/src/algorithms/bin_path_info.hpp b/src/algorithms/bin_path_info.hpp index d0e6200d..1b16b8ca 100644 --- a/src/algorithms/bin_path_info.hpp +++ b/src/algorithms/bin_path_info.hpp @@ -36,6 +36,7 @@ namespace odgi { const std::function &handle_sequence, const std::function &handle_fasta, uint64_t num_bins = 0, - uint64_t bin_width = 0); + uint64_t bin_width = 0, + bool drop_gap_links = false); } } diff --git a/src/subcommand/bin_main.cpp b/src/subcommand/bin_main.cpp index e4320746..01aaf088 100644 --- a/src/subcommand/bin_main.cpp +++ b/src/subcommand/bin_main.cpp @@ -17,7 +17,7 @@ int main_bin(int argc, char** argv) { std::string prog_name = "odgi bin"; argv[0] = (char*)prog_name.c_str(); --argc; - + args::ArgumentParser parser("binning of path information in the graph"); args::HelpFlag help(parser, "help", "display this help summary", {'h', "help"}); args::ValueFlag dg_out_file(parser, "FILE", "store the graph in this file", {'o', "out"}); @@ -29,6 +29,7 @@ int main_bin(int argc, char** argv) { args::ValueFlag num_bins(parser, "N", "number of bins", {'n', "num-bins"}); args::ValueFlag bin_width(parser, "bp", "width of each bin in basepairs along the graph vector", {'w', "bin-width"}); args::Flag write_seqs_not(parser, "write-seqs-not", "don't write out the sequences for each bin", {'s', "no-seqs"}); + args::Flag drop_gap_links(parser, "drop-gap-links", "don't include gap links in the output", {'g', "no-gap-links"}); try { parser.ParseCLI(argc, argv); } catch (args::Help) { @@ -151,8 +152,8 @@ int main_bin(int argc, char** argv) { << info.mean_cov << "," << info.mean_inv << "," << info.mean_pos << "," - << info.first_nucleotide << "," - << info.last_nucleotide << "]"; + << info.first_nucleotide << "," + << info.last_nucleotide << "]"; if (i+1 != bins.size()) { std::cout << ","; } @@ -192,8 +193,8 @@ int main_bin(int argc, char** argv) { << info.mean_cov << "\t" << info.mean_inv << "\t" << info.mean_pos << "\t" - << info.first_nucleotide << "\t" - << info.last_nucleotide << std::endl; + << info.first_nucleotide << "\t" + << info.last_nucleotide << std::endl; } } }; @@ -201,7 +202,7 @@ int main_bin(int argc, char** argv) { if (args::get(output_json)) { algorithms::bin_path_info(graph, (args::get(aggregate_delim) ? args::get(path_delim) : ""), write_header_json,write_json, write_seq_json, write_fasta, - args::get(num_bins), args::get(bin_width)); + args::get(num_bins), args::get(bin_width), args::get(drop_gap_links)); } else { std::cout << "path.name" << "\t" << "path.prefix" << "\t" @@ -214,7 +215,7 @@ int main_bin(int argc, char** argv) { << "last.nucl" << std::endl; algorithms::bin_path_info(graph, (args::get(aggregate_delim) ? args::get(path_delim) : ""), write_header_tsv,write_tsv, write_seq_noop, write_fasta, - args::get(num_bins), args::get(bin_width)); + args::get(num_bins), args::get(bin_width), args::get(drop_gap_links)); } return 0; }