Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

the bubbles #382

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ add_library(odgi_objs OBJECT
${CMAKE_SOURCE_DIR}/src/subcommand/tips_main.cpp
${CMAKE_SOURCE_DIR}/src/subcommand/stepindex_main.cpp
${CMAKE_SOURCE_DIR}/src/subcommand/heaps_main.cpp
${CMAKE_SOURCE_DIR}/src/subcommand/bubble_main.cpp
${CMAKE_SOURCE_DIR}/src/subcommand/pav_main.cpp
${CMAKE_SOURCE_DIR}/src/algorithms/topological_sort.cpp
${CMAKE_SOURCE_DIR}/src/algorithms/kmer.cpp
Expand Down Expand Up @@ -535,6 +536,7 @@ add_library(odgi_objs OBJECT
${CMAKE_SOURCE_DIR}/src/algorithms/groom.cpp
${CMAKE_SOURCE_DIR}/src/algorithms/crush_n.cpp
${CMAKE_SOURCE_DIR}/src/algorithms/heaps.cpp
${CMAKE_SOURCE_DIR}/src/algorithms/bubble.cpp
${CMAKE_SOURCE_DIR}/src/unittest/edge.cpp
${CMAKE_SOURCE_DIR}/src/algorithms/tips.cpp
${CMAKE_SOURCE_DIR}/src/algorithms/path_jaccard.cpp
Expand Down
179 changes: 179 additions & 0 deletions src/algorithms/bubble.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#include "bubble.hpp"

namespace odgi {

namespace algorithms {

uint64_t hash_vec(std::vector<uint64_t> const& vec) {
uint64_t seed = vec.size();
for(auto& i : vec) {
seed ^= i + 0x9e3779b97f4a7c15 + (seed << 6) + (seed >> 2);
}
return seed;
}

size_t hash_steps_on_handle(const PathHandleGraph& graph,
const handle_t& handle) {
std::vector<uint64_t> v;
graph.for_each_step_on_handle(
handle,
[&](const step_handle_t& step) {
v.push_back(as_integer(graph.get_path_handle_of_step(step)));
});
std::sort(v.begin(), v.end());
return hash_vec(v);
}

void for_each_bubble(const PathHandleGraph& graph,
const step_index_t& step_index,
const path_handle_t& path,
const std::function<void(const bubble_t& bubble)>& func) {

std::vector<step_hash_t> step_hashes;
uint64_t pos = 0;
graph.for_each_step_in_path(
path, [&](const step_handle_t& step) {
auto handle = graph.get_handle_of_step(step);
auto depth = graph.get_step_count(handle);
auto length = graph.get_length(handle);
step_hashes.push_back(
{ step,
pos,
length,
depth,
hash_steps_on_handle(
graph,
handle)
});
pos += length;
});
auto j = step_hashes.begin();
std::vector<bubble_t> bubbles;
uint64_t max_bubble = 1e6;
for (auto i = step_hashes.begin(); i != step_hashes.end(); ++i) {
auto n = i;
uint64_t distance = 0;
while (++n != step_hashes.end() && n != j && distance < max_bubble) {
distance += n->length;
if (n->hash == i->hash) {
//bubbles.push_back({&*i, &*n});
i->other_tail = &*n;
n->other_head = &*i;
j = n;
break;
}
}
}

/*
for (auto& bubble : bubbles) {
std::cout << bubble.head->pos << "@" << bubble.head->depth << " "
<< bubble.tail->pos << "@" << bubble.tail->depth << std::endl;
}
*/

for (auto i = step_hashes.begin(); i != step_hashes.end(); ++i) {
if (i->other_tail == nullptr || !i->other_tail->is_head_tail()) { // tail is filtered
uint64_t distance = 0;
//auto n = (std::vector<step_hash_t>::iterator)i->other_tail;
auto n = i;
while (++n != step_hashes.end() && distance < max_bubble) {
distance += n->length;
if (n->hash == i->hash
&& n->other_head == nullptr
&& n->other_tail != nullptr) {
bubbles.push_back({&*i, &*n});
i->other_tail = &*n;
n->other_head = &*i;
break;
}
}
} else {
auto n = i->other_tail;
bubbles.push_back({&*i, &*n});
}
}

for (auto i = step_hashes.rbegin(); i != step_hashes.rend(); ++i) {
if (i->other_head == nullptr || !i->other_head->is_head_tail()) { // tail is filtered
uint64_t distance = 0;
//auto n = (std::vector<step_hash_t>::iterator)i->other_tail;
auto n = i;
while (++n != step_hashes.rend() && distance < max_bubble) {
distance += n->length;
if (n->hash == i->hash
&& n->other_tail == nullptr
&& n->other_head != nullptr) {
bubbles.push_back({&*n, &*i});
i->other_head = &*n;
n->other_tail = &*i;
break;
}
}
} else {
auto n = i->other_head;
bubbles.push_back({&*n, &*i});
}
}

/*
std::vector<bubble_chain_t> chains;
std::vector<bool> seen(step_hashes.size());
for (auto i = step_hashes.begin(); i != step_hashes.end(); ++i) {
chains.emplace_back();
auto& c = chains.back().bubbles;
// follow the chain to the end
//
}
*/

// to filter the bubbles to keep those whose head is the tail of another and vice-versa
/*
bubbles.erase(
std::remove_if(
bubbles.begin(),
bubbles.end(),
[](const bubble_t& bubble) {
return !(bubble.head->is_head_tail()
&& bubble.tail->is_head_tail());
}),
bubbles.end());
*/

//if (false) {
// now draw chains
/*
std::cout << "digraph graphname {\n"
<< "node [shape=plaintext];\n";
//<< "rankdir=LR;\n";
for (auto& bubble : bubbles) {
std::cout << "\"" << bubble.head->pos << "\" [label=\"" << bubble.head->pos
<< "@" << bubble.head->depth << "x\"];\n";
}
for (auto& bubble : bubbles) {
std::cout << "\"" << bubble.head->pos << "\" -> \"" << bubble.tail->pos << "\";\n";
//std::cout << "bubble " << bubble.head->pos << " " << bubble.head->depth << " -> " << bubble.tail->pos << " " << bubble.tail->depth << std::endl;
func(bubble);
}
std::cout << "}\n";
*/
//}
std::cout << "digraph graphname {\n"
<< "node [shape=plaintext];\n";
for (auto i = step_hashes.begin(); i != step_hashes.end(); ++i) {
std::cout << "\"" << i->pos << "\" [label=\"" << i->pos
<< "@" << i->depth << "x\"];\n";
}
for (auto i = step_hashes.begin(); i != step_hashes.end(); ++i) {
std::cout << "\"" << (i->other_head != nullptr ? std::to_string(i->other_head->pos) : "null" )
<< "\" -> \"" << i->pos << "\";\n";
std::cout << "\"" << i->pos << "\" -> \""
<< (i->other_tail != nullptr ? std::to_string(i->other_tail->pos) : "null" ) << "\";\n";
}
std::cout << "}\n";

}

}

}
53 changes: 53 additions & 0 deletions src/algorithms/bubble.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#pragma once

#include <iostream>
#include <vector>
#include <map>
#include <set>
#include <algorithm>
#include <random>
#include <omp.h>
#include "hash_map.hpp"
#include "position.hpp"
#include "stepindex.hpp"
#include <handlegraph/types.hpp>
#include <handlegraph/iteratee.hpp>
#include <handlegraph/util.hpp>
#include <handlegraph/handle_graph.hpp>
#include <handlegraph/path_handle_graph.hpp>
#include <atomic_bitvector.hpp>

namespace odgi {

using namespace handlegraph;

namespace algorithms {

struct step_hash_t {
step_handle_t step;
uint64_t pos;
uint64_t length;
uint64_t depth;
uint64_t hash;
step_hash_t* other_tail = nullptr;
step_hash_t* other_head = nullptr;
bool is_head_tail(void) { return other_head != nullptr && other_tail != nullptr; };
};

struct bubble_t {
step_hash_t* head;
step_hash_t* tail;
};

struct bubble_chain_t {
std::vector<bubble_t> bubbles;
};

void for_each_bubble(const PathHandleGraph& graph,
const step_index_t& step_index,
const path_handle_t& path,
const std::function<void(const bubble_t& bubble)>& func);

}

}
104 changes: 104 additions & 0 deletions src/subcommand/bubble_main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#include "subcommand.hpp"
#include "odgi.hpp"
#include "args.hxx"
#include <omp.h>
#include "algorithms/bubble.hpp"
#include "algorithms/stepindex.hpp"
#include "utils.hpp"
#include "split.hpp"

namespace odgi {

using namespace odgi::subcommand;

int main_bubble(int argc, char **argv) {

// trick argumentparser to do the right thing with the subcommand
for (uint64_t i = 1; i < argc - 1; ++i) {
argv[i] = argv[i + 1];
}
const std::string prog_name = "odgi bubble";
argv[0] = (char *) prog_name.c_str();
--argc
;
args::ArgumentParser parser("Path-based bubble detection and VCF projection.");
args::Group mandatory_opts(parser, "[ MANDATORY ARGUMENTS ]");
args::ValueFlag<std::string> og_in_file(mandatory_opts, "FILE", "Load the succinct variation graph in ODGI format from this *FILE*. The file name usually ends with *.og*. It also accepts GFAv1, but the on-the-fly conversion to the ODGI format requires additional time!", {'i', "idx"});
args::Group bubble_opts(parser, "[ Bubble Options ]");
args::ValueFlag<std::string> _target_path(bubble_opts, "NAME", "Target (reference) path name.", {'p', "target-path"});
// args::ValueFlag<std::string> _path_groups(bubble_opts, "FILE", "Group paths as described in two-column FILE, with columns path.name and group.name.",
// {'p', "path-groups"});
// args::Flag group_by_sample(bubble_opts, "bool", "Following PanSN naming (sample#hap#ctg), group by sample (1st field).", {'S', "group-by-sample"});
// args::Flag group_by_haplotype(bubble_opts, "bool", "Following PanSN naming (sample#hap#ctg), group by haplotype (2nd field).", {'H', "group-by-haplotype"});
// args::ValueFlag<std::string> _bed_targets(bubble_opts, "FILE", "BED file over path space of the graph, describing a subset of the graph to consider.",
// {'b', "bed-targets"});
// args::ValueFlag<uint64_t> _n_permutations(bubble_opts, "N", "Number of permutations to run.",
// {'n', "n-permutations"});
args::Group threading_opts(parser, "[ Threading ]");
args::ValueFlag<uint64_t> nthreads(threading_opts, "N", "Number of threads to use for parallel operations.",
{'t', "threads"});
args::Group processing_info_opts(parser, "[ Processing Information ]");
//args::Flag debug(processing_info_opts, "debug", "Print information about the process to stderr.", {'d', "debug"});
args::Flag progress(processing_info_opts, "progress", "Write the current progress to stderr.", {'P', "progress"});
args::Group program_info_opts(parser, "[ Program Information ]");
args::HelpFlag help(program_info_opts, "help", "Print a help message for odgi bubble.", {'h', "help"});
try {
parser.ParseCLI(argc, argv);
} catch (args::Help) {
std::cout << parser;
return 0;
} catch (args::ParseError e) {
std::cerr << e.what() << std::endl;
std::cerr << parser;
return 1;
}
if (argc == 1) {
std::cout << parser;
return 1;
}

if (!og_in_file) {
std::cerr
<< "[odgi::bubble] error: please specify an input file from where to load the graph via -i=[FILE], --idx=[FILE]."
<< std::endl;
return 1;
}

const uint64_t num_threads = args::get(nthreads) ? args::get(nthreads) : 1;
omp_set_num_threads(num_threads);

graph_t graph;
assert(argc > 0);
{
const std::string infile = args::get(og_in_file);
if (!infile.empty()) {
if (infile == "-") {
graph.deserialize(std::cin);
} else {
utils::handle_gfa_odgi_input(infile, "bubble", args::get(progress), num_threads, graph);
}
}
}

graph.set_number_of_threads(num_threads);

auto target_path = graph.get_path_handle(args::get(_target_path));

algorithms::step_index_t step_index(graph, {target_path}, num_threads, true, 8);
//step_index.save(step_index_out_file);

//int last_depth = 0;
auto handle_output = [&](const algorithms::bubble_t& bubble) {
//
};

algorithms::for_each_bubble(graph, step_index, target_path, handle_output);

return 0;
}

static Subcommand odgi_bubble("bubble", "variant detection using linearized bubbles.",
PIPELINE, 3, main_bubble);


}
2 changes: 1 addition & 1 deletion src/subcommand/stepindex_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,4 @@ namespace odgi {
"Generate a step index and access the position of each step of each path once.",
PIPELINE, 3, main_stepindex);

}
}