Skip to content

Commit

Permalink
command-line option for removing gap links from the output
Browse files Browse the repository at this point in the history
  • Loading branch information
lomereiter committed Apr 8, 2020
1 parent 39f467a commit d495ce0
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 14 deletions.
50 changes: 44 additions & 6 deletions src/algorithms/bin_path_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ void bin_path_info(const PathHandleGraph& graph,
const std::function<void(const uint64_t&, const std::string&)>& handle_sequence,
const std::function<void(const string&)>& handle_fasta,
uint64_t num_bins,
uint64_t bin_width) {
uint64_t bin_width,
bool drop_gap_links) {
// the graph must be compacted for this to work
std::vector<uint64_t> position_map(graph.get_node_count()+1);
uint64_t len = 0;
Expand Down Expand Up @@ -40,6 +41,7 @@ void bin_path_info(const PathHandleGraph& graph,
handle_fasta(graph_seq);
graph_seq.clear(); // clean up
std::unordered_map<path_handle_t, uint64_t> path_length;
uint64_t gap_links_removed = 0;
graph.for_each_path_handle([&](const path_handle_t& path) {
std::vector<std::pair<uint64_t, uint64_t>> links;
std::map<uint64_t, path_info_t> bins;
Expand Down Expand Up @@ -69,26 +71,62 @@ void bin_path_info(const PathHandleGraph& graph,
}
bins[curr_bin].mean_pos += path_pos++;
nucleotide_count += 1;
if(bins[curr_bin].first_nucleotide == 0){
bins[curr_bin].first_nucleotide = nucleotide_count;
}
bins[curr_bin].last_nucleotide = nucleotide_count;
if(bins[curr_bin].first_nucleotide == 0){
bins[curr_bin].first_nucleotide = nucleotide_count;
}
bins[curr_bin].last_nucleotide = nucleotide_count;
last_bin = curr_bin;
last_is_rev = is_rev;
last_pos_in_bin = curr_pos_in_bin;
}
});
links.push_back(std::make_pair(last_bin,0));
uint64_t path_length = path_pos;
uint64_t end_nucleotide = nucleotide_count;
uint64_t end_nucleotide = nucleotide_count;
for (auto& entry : bins) {
auto& v = entry.second;
v.mean_inv /= (v.mean_cov ? v.mean_cov : 1);
v.mean_cov /= bin_width;
v.mean_pos /= bin_width * path_length * v.mean_cov;
}

if (drop_gap_links) {
std::vector<uint64_t> bin_ids;
for (const auto &entry: bins) {
bin_ids.push_back(entry.first);
}
std::sort(bin_ids.begin(), bin_ids.end());

uint64_t fill_pos = 0;

for (uint64_t i = 0; i < links.size(); ++i) {
auto link = links[i];

if (link.first == 0 || link.second == 0)
continue;

if (link.first > link.second) {
links[fill_pos++] = link;
continue;
}

auto left_it = std::lower_bound(bin_ids.begin(), bin_ids.end(), link.first + 1);
auto right_it = std::lower_bound(bin_ids.begin(), bin_ids.end(), link.second);
if (right_it > left_it) {
links[fill_pos++] = link;
}
}

gap_links_removed += links.size() - fill_pos;
links.resize(fill_pos);
}

handle_path(graph.get_path_name(path), links, bins);
});

if (drop_gap_links) {
std::cerr << "Gap links removed: " << gap_links_removed << std::endl;
}
}

}
Expand Down
3 changes: 2 additions & 1 deletion src/algorithms/bin_path_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ namespace odgi {
const std::function<void(const uint64_t &, const std::string &)> &handle_sequence,
const std::function<void(const std::string&)> &handle_fasta,
uint64_t num_bins = 0,
uint64_t bin_width = 0);
uint64_t bin_width = 0,
bool drop_gap_links = false);
}
}
15 changes: 8 additions & 7 deletions src/subcommand/bin_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ int main_bin(int argc, char** argv) {
std::string prog_name = "odgi bin";
argv[0] = (char*)prog_name.c_str();
--argc;

args::ArgumentParser parser("binning of path information in the graph");
args::HelpFlag help(parser, "help", "display this help summary", {'h', "help"});
args::ValueFlag<std::string> dg_out_file(parser, "FILE", "store the graph in this file", {'o', "out"});
Expand All @@ -29,6 +29,7 @@ int main_bin(int argc, char** argv) {
args::ValueFlag<uint64_t> num_bins(parser, "N", "number of bins", {'n', "num-bins"});
args::ValueFlag<uint64_t> bin_width(parser, "bp", "width of each bin in basepairs along the graph vector", {'w', "bin-width"});
args::Flag write_seqs_not(parser, "write-seqs-not", "don't write out the sequences for each bin", {'s', "no-seqs"});
args::Flag drop_gap_links(parser, "drop-gap-links", "don't include gap links in the output", {'g', "no-gap-links"});
try {
parser.ParseCLI(argc, argv);
} catch (args::Help) {
Expand Down Expand Up @@ -151,8 +152,8 @@ int main_bin(int argc, char** argv) {
<< info.mean_cov << ","
<< info.mean_inv << ","
<< info.mean_pos << ","
<< info.first_nucleotide << ","
<< info.last_nucleotide << "]";
<< info.first_nucleotide << ","
<< info.last_nucleotide << "]";
if (i+1 != bins.size()) {
std::cout << ",";
}
Expand Down Expand Up @@ -192,16 +193,16 @@ int main_bin(int argc, char** argv) {
<< info.mean_cov << "\t"
<< info.mean_inv << "\t"
<< info.mean_pos << "\t"
<< info.first_nucleotide << "\t"
<< info.last_nucleotide << std::endl;
<< info.first_nucleotide << "\t"
<< info.last_nucleotide << std::endl;
}
}
};

if (args::get(output_json)) {
algorithms::bin_path_info(graph, (args::get(aggregate_delim) ? args::get(path_delim) : ""),
write_header_json,write_json, write_seq_json, write_fasta,
args::get(num_bins), args::get(bin_width));
args::get(num_bins), args::get(bin_width), args::get(drop_gap_links));
} else {
std::cout << "path.name" << "\t"
<< "path.prefix" << "\t"
Expand All @@ -214,7 +215,7 @@ int main_bin(int argc, char** argv) {
<< "last.nucl" << std::endl;
algorithms::bin_path_info(graph, (args::get(aggregate_delim) ? args::get(path_delim) : ""),
write_header_tsv,write_tsv, write_seq_noop, write_fasta,
args::get(num_bins), args::get(bin_width));
args::get(num_bins), args::get(bin_width), args::get(drop_gap_links));
}
return 0;
}
Expand Down

0 comments on commit d495ce0

Please sign in to comment.