diff --git a/CMakeLists.txt b/CMakeLists.txt index fd529b3e..850d6a8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -482,6 +482,7 @@ add_library(odgi_objs OBJECT ${CMAKE_SOURCE_DIR}/src/subcommand/tips_main.cpp ${CMAKE_SOURCE_DIR}/src/subcommand/stepindex_main.cpp ${CMAKE_SOURCE_DIR}/src/subcommand/heaps_main.cpp + ${CMAKE_SOURCE_DIR}/src/subcommand/bubble_main.cpp ${CMAKE_SOURCE_DIR}/src/subcommand/pav_main.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/topological_sort.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/kmer.cpp @@ -535,6 +536,7 @@ add_library(odgi_objs OBJECT ${CMAKE_SOURCE_DIR}/src/algorithms/groom.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/crush_n.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/heaps.cpp + ${CMAKE_SOURCE_DIR}/src/algorithms/bubble.cpp ${CMAKE_SOURCE_DIR}/src/unittest/edge.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/tips.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/path_jaccard.cpp diff --git a/src/algorithms/bubble.cpp b/src/algorithms/bubble.cpp new file mode 100644 index 00000000..9af209cb --- /dev/null +++ b/src/algorithms/bubble.cpp @@ -0,0 +1,179 @@ +#include "bubble.hpp" + +namespace odgi { + +namespace algorithms { + +uint64_t hash_vec(std::vector const& vec) { + uint64_t seed = vec.size(); + for(auto& i : vec) { + seed ^= i + 0x9e3779b97f4a7c15 + (seed << 6) + (seed >> 2); + } + return seed; +} + +size_t hash_steps_on_handle(const PathHandleGraph& graph, + const handle_t& handle) { + std::vector v; + graph.for_each_step_on_handle( + handle, + [&](const step_handle_t& step) { + v.push_back(as_integer(graph.get_path_handle_of_step(step))); + }); + std::sort(v.begin(), v.end()); + return hash_vec(v); +} + +void for_each_bubble(const PathHandleGraph& graph, + const step_index_t& step_index, + const path_handle_t& path, + const std::function& func) { + + std::vector step_hashes; + uint64_t pos = 0; + graph.for_each_step_in_path( + path, [&](const step_handle_t& step) { + auto handle = graph.get_handle_of_step(step); + auto depth = graph.get_step_count(handle); + auto length = graph.get_length(handle); + step_hashes.push_back( + { step, + pos, + length, + depth, + hash_steps_on_handle( + graph, + handle) + }); + pos += length; + }); + auto j = step_hashes.begin(); + std::vector bubbles; + uint64_t max_bubble = 1e6; + for (auto i = step_hashes.begin(); i != step_hashes.end(); ++i) { + auto n = i; + uint64_t distance = 0; + while (++n != step_hashes.end() && n != j && distance < max_bubble) { + distance += n->length; + if (n->hash == i->hash) { + //bubbles.push_back({&*i, &*n}); + i->other_tail = &*n; + n->other_head = &*i; + j = n; + break; + } + } + } + + /* + for (auto& bubble : bubbles) { + std::cout << bubble.head->pos << "@" << bubble.head->depth << " " + << bubble.tail->pos << "@" << bubble.tail->depth << std::endl; + } + */ + + for (auto i = step_hashes.begin(); i != step_hashes.end(); ++i) { + if (i->other_tail == nullptr || !i->other_tail->is_head_tail()) { // tail is filtered + uint64_t distance = 0; + //auto n = (std::vector::iterator)i->other_tail; + auto n = i; + while (++n != step_hashes.end() && distance < max_bubble) { + distance += n->length; + if (n->hash == i->hash + && n->other_head == nullptr + && n->other_tail != nullptr) { + bubbles.push_back({&*i, &*n}); + i->other_tail = &*n; + n->other_head = &*i; + break; + } + } + } else { + auto n = i->other_tail; + bubbles.push_back({&*i, &*n}); + } + } + + for (auto i = step_hashes.rbegin(); i != step_hashes.rend(); ++i) { + if (i->other_head == nullptr || !i->other_head->is_head_tail()) { // tail is filtered + uint64_t distance = 0; + //auto n = (std::vector::iterator)i->other_tail; + auto n = i; + while (++n != step_hashes.rend() && distance < max_bubble) { + distance += n->length; + if (n->hash == i->hash + && n->other_tail == nullptr + && n->other_head != nullptr) { + bubbles.push_back({&*n, &*i}); + i->other_head = &*n; + n->other_tail = &*i; + break; + } + } + } else { + auto n = i->other_head; + bubbles.push_back({&*n, &*i}); + } + } + + /* + std::vector chains; + std::vector seen(step_hashes.size()); + for (auto i = step_hashes.begin(); i != step_hashes.end(); ++i) { + chains.emplace_back(); + auto& c = chains.back().bubbles; + // follow the chain to the end + // + } + */ + + // to filter the bubbles to keep those whose head is the tail of another and vice-versa + /* + bubbles.erase( + std::remove_if( + bubbles.begin(), + bubbles.end(), + [](const bubble_t& bubble) { + return !(bubble.head->is_head_tail() + && bubble.tail->is_head_tail()); + }), + bubbles.end()); + */ + + //if (false) { + // now draw chains + /* + std::cout << "digraph graphname {\n" + << "node [shape=plaintext];\n"; + //<< "rankdir=LR;\n"; + for (auto& bubble : bubbles) { + std::cout << "\"" << bubble.head->pos << "\" [label=\"" << bubble.head->pos + << "@" << bubble.head->depth << "x\"];\n"; + } + for (auto& bubble : bubbles) { + std::cout << "\"" << bubble.head->pos << "\" -> \"" << bubble.tail->pos << "\";\n"; + //std::cout << "bubble " << bubble.head->pos << " " << bubble.head->depth << " -> " << bubble.tail->pos << " " << bubble.tail->depth << std::endl; + func(bubble); + } + std::cout << "}\n"; + */ + //} + std::cout << "digraph graphname {\n" + << "node [shape=plaintext];\n"; + for (auto i = step_hashes.begin(); i != step_hashes.end(); ++i) { + std::cout << "\"" << i->pos << "\" [label=\"" << i->pos + << "@" << i->depth << "x\"];\n"; + } + for (auto i = step_hashes.begin(); i != step_hashes.end(); ++i) { + std::cout << "\"" << (i->other_head != nullptr ? std::to_string(i->other_head->pos) : "null" ) + << "\" -> \"" << i->pos << "\";\n"; + std::cout << "\"" << i->pos << "\" -> \"" + << (i->other_tail != nullptr ? std::to_string(i->other_tail->pos) : "null" ) << "\";\n"; + } + std::cout << "}\n"; + +} + +} + +} diff --git a/src/algorithms/bubble.hpp b/src/algorithms/bubble.hpp new file mode 100644 index 00000000..b493a6b6 --- /dev/null +++ b/src/algorithms/bubble.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "hash_map.hpp" +#include "position.hpp" +#include "stepindex.hpp" +#include +#include +#include +#include +#include +#include + +namespace odgi { + +using namespace handlegraph; + +namespace algorithms { + +struct step_hash_t { + step_handle_t step; + uint64_t pos; + uint64_t length; + uint64_t depth; + uint64_t hash; + step_hash_t* other_tail = nullptr; + step_hash_t* other_head = nullptr; + bool is_head_tail(void) { return other_head != nullptr && other_tail != nullptr; }; +}; + +struct bubble_t { + step_hash_t* head; + step_hash_t* tail; +}; + +struct bubble_chain_t { + std::vector bubbles; +}; + +void for_each_bubble(const PathHandleGraph& graph, + const step_index_t& step_index, + const path_handle_t& path, + const std::function& func); + +} + +} diff --git a/src/subcommand/bubble_main.cpp b/src/subcommand/bubble_main.cpp new file mode 100644 index 00000000..981833aa --- /dev/null +++ b/src/subcommand/bubble_main.cpp @@ -0,0 +1,104 @@ +#include "subcommand.hpp" +#include "odgi.hpp" +#include "args.hxx" +#include +#include "algorithms/bubble.hpp" +#include "algorithms/stepindex.hpp" +#include "utils.hpp" +#include "split.hpp" + +namespace odgi { + +using namespace odgi::subcommand; + +int main_bubble(int argc, char **argv) { + + // trick argumentparser to do the right thing with the subcommand + for (uint64_t i = 1; i < argc - 1; ++i) { + argv[i] = argv[i + 1]; + } + const std::string prog_name = "odgi bubble"; + argv[0] = (char *) prog_name.c_str(); + --argc +; + args::ArgumentParser parser("Path-based bubble detection and VCF projection."); + args::Group mandatory_opts(parser, "[ MANDATORY ARGUMENTS ]"); + args::ValueFlag og_in_file(mandatory_opts, "FILE", "Load the succinct variation graph in ODGI format from this *FILE*. The file name usually ends with *.og*. It also accepts GFAv1, but the on-the-fly conversion to the ODGI format requires additional time!", {'i', "idx"}); + args::Group bubble_opts(parser, "[ Bubble Options ]"); + args::ValueFlag _target_path(bubble_opts, "NAME", "Target (reference) path name.", {'p', "target-path"}); +// args::ValueFlag _path_groups(bubble_opts, "FILE", "Group paths as described in two-column FILE, with columns path.name and group.name.", +// {'p', "path-groups"}); +// args::Flag group_by_sample(bubble_opts, "bool", "Following PanSN naming (sample#hap#ctg), group by sample (1st field).", {'S', "group-by-sample"}); +// args::Flag group_by_haplotype(bubble_opts, "bool", "Following PanSN naming (sample#hap#ctg), group by haplotype (2nd field).", {'H', "group-by-haplotype"}); +// args::ValueFlag _bed_targets(bubble_opts, "FILE", "BED file over path space of the graph, describing a subset of the graph to consider.", +// {'b', "bed-targets"}); +// args::ValueFlag _n_permutations(bubble_opts, "N", "Number of permutations to run.", +// {'n', "n-permutations"}); + args::Group threading_opts(parser, "[ Threading ]"); + args::ValueFlag nthreads(threading_opts, "N", "Number of threads to use for parallel operations.", + {'t', "threads"}); + args::Group processing_info_opts(parser, "[ Processing Information ]"); + //args::Flag debug(processing_info_opts, "debug", "Print information about the process to stderr.", {'d', "debug"}); + args::Flag progress(processing_info_opts, "progress", "Write the current progress to stderr.", {'P', "progress"}); + args::Group program_info_opts(parser, "[ Program Information ]"); + args::HelpFlag help(program_info_opts, "help", "Print a help message for odgi bubble.", {'h', "help"}); + try { + parser.ParseCLI(argc, argv); + } catch (args::Help) { + std::cout << parser; + return 0; + } catch (args::ParseError e) { + std::cerr << e.what() << std::endl; + std::cerr << parser; + return 1; + } + if (argc == 1) { + std::cout << parser; + return 1; + } + + if (!og_in_file) { + std::cerr + << "[odgi::bubble] error: please specify an input file from where to load the graph via -i=[FILE], --idx=[FILE]." + << std::endl; + return 1; + } + + const uint64_t num_threads = args::get(nthreads) ? args::get(nthreads) : 1; + omp_set_num_threads(num_threads); + + graph_t graph; + assert(argc > 0); + { + const std::string infile = args::get(og_in_file); + if (!infile.empty()) { + if (infile == "-") { + graph.deserialize(std::cin); + } else { + utils::handle_gfa_odgi_input(infile, "bubble", args::get(progress), num_threads, graph); + } + } + } + + graph.set_number_of_threads(num_threads); + + auto target_path = graph.get_path_handle(args::get(_target_path)); + + algorithms::step_index_t step_index(graph, {target_path}, num_threads, true, 8); + //step_index.save(step_index_out_file); + + //int last_depth = 0; + auto handle_output = [&](const algorithms::bubble_t& bubble) { + // + }; + + algorithms::for_each_bubble(graph, step_index, target_path, handle_output); + + return 0; +} + +static Subcommand odgi_bubble("bubble", "variant detection using linearized bubbles.", + PIPELINE, 3, main_bubble); + + +} diff --git a/src/subcommand/stepindex_main.cpp b/src/subcommand/stepindex_main.cpp index 39bc96ed..ce482199 100644 --- a/src/subcommand/stepindex_main.cpp +++ b/src/subcommand/stepindex_main.cpp @@ -96,4 +96,4 @@ namespace odgi { "Generate a step index and access the position of each step of each path once.", PIPELINE, 3, main_stepindex); -} \ No newline at end of file +}