Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

prevent quadratic untangle cut point explosions #474

Draft
wants to merge 10 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ endif()
# Set optimization through command line; see INSTALL.md
if (${CMAKE_BUILD_TYPE} MATCHES Release)
set(EXTRA_FLAGS "-Ofast -march=native -pipe -msse4.2 -funroll-all-loops") # -fprofile-generate=../pgo")
#set(EXTRA_FLAGS "-O0 -march=native -msse4.2 -pg") # for profiling with gprof
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG") # reset CXX_FLAGS to be able to replace -O3 with -Ofast
endif ()

Expand Down
12 changes: 6 additions & 6 deletions src/algorithms/stepindex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,14 +275,14 @@ path_step_index_t::path_step_index_t(const PathHandleGraph& graph,
node_offset[0] = 0; // first offset is 0 by definition
for (auto& node_step : steps_by_node) {
auto& idx = std::get<0>(node_step);
//auto& offset = std::get<1>(node_step); // just used for sorting
auto& offset = std::get<1>(node_step); // just used for sorting
auto& step = std::get<2>(node_step);
//std::cerr << "idx = " << idx << " " << as_integers(step)[0] << ":" << as_integers(step)[1] << std::endl;
if (idx != last_idx) {
node_offset[idx] = node_steps.size();
}
step_offset[step_mphf->lookup(step)] = node_steps.size();
node_steps.push_back(step);
node_steps.push_back(std::make_pair(step, offset));
last_idx = idx;
}
if (last_idx != node_count-1) {
Expand Down Expand Up @@ -313,7 +313,7 @@ uint64_t path_step_index_t::n_steps_on_node(const nid_t& id) const {
return node_offset[idx+1] - node_offset[idx];
}

std::pair<bool, step_handle_t>
std::pair<bool, std::pair<step_handle_t, uint64_t>>
path_step_index_t::get_next_step_on_node(const nid_t& id, const step_handle_t& step) const {
auto node_idx = get_node_idx(id);
auto curr_steps = node_offset[node_idx];
Expand All @@ -323,20 +323,20 @@ path_step_index_t::get_next_step_on_node(const nid_t& id, const step_handle_t& s
if (has_next) {
return std::make_pair(true, node_steps[step_idx+1]);
} else {
step_handle_t empty_step;
std::pair<step_handle_t,uint64_t> empty_step;
return std::make_pair(false, empty_step);
}
}

std::pair<bool, step_handle_t>
std::pair<bool, std::pair<step_handle_t, uint64_t>>
path_step_index_t::get_prev_step_on_node(const nid_t& id, const step_handle_t& step) const {
auto curr_steps = node_offset[get_node_idx(id)];
auto step_idx = step_offset[get_step_idx(step)];
bool has_prev = step_idx > curr_steps;
if (has_prev) {
return std::make_pair(true, node_steps[step_idx-1]);
} else {
step_handle_t empty_step;
std::pair<step_handle_t,uint64_t> empty_step;
return std::make_pair(false, empty_step);
}
}
Expand Down
8 changes: 5 additions & 3 deletions src/algorithms/stepindex.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ struct path_step_index_t {
// map to the beginning of a range in node_steps
std::vector<uint64_t> node_offset;
// record the steps in positional order by node (index given in node_offset)
std::vector<step_handle_t> node_steps;
std::vector<std::pair<step_handle_t, uint64_t>> node_steps;
// map from step to an index in step_offset
boophf_step_t* step_mphf = nullptr;
// index in handle_steps for the given step
Expand All @@ -114,9 +114,11 @@ struct path_step_index_t {
uint64_t n_steps_on_node(const nid_t& id) const;
// these functions require, but do not check, that our step is in the indexed path
// next step on node (sorted by position in path), (false, _) if there is no next step
std::pair<bool, step_handle_t> get_next_step_on_node(const nid_t& id, const step_handle_t& step) const;
std::pair<bool, std::pair<step_handle_t, uint64_t>>
get_next_step_on_node(const nid_t& id, const step_handle_t& step) const;
// prev step on node (sorted by position in path), (false, _) if there is no next step
std::pair<bool, step_handle_t> get_prev_step_on_node(const nid_t& id, const step_handle_t& step) const;
std::pair<bool, std::pair<step_handle_t, uint64_t>>
get_prev_step_on_node(const nid_t& id, const step_handle_t& step) const;
};

}
Expand Down
117 changes: 73 additions & 44 deletions src/algorithms/untangle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,12 @@ std::vector<step_handle_t> untangle_cuts(
const std::function<bool(const handle_t&)>& is_cut) {
auto path = graph.get_path_handle_of_step(_start);
auto path_name = graph.get_path_name(path);
// this assumes that the end is not inclusive

/*
std::cerr << "untangle_cuts(" << path_name << ", "
<< step_index.get_position(_start) << ", "
<< step_index.get_position(_end) << ")" << std::endl;
*/
// TODO make a vector of handles we've seen as segment starts and of segment ends
// to avoid duplicating work

std::vector<bool> seen_fwd_step(self_index.step_count);
std::vector<bool> seen_rev_step(self_index.step_count);
Expand All @@ -32,45 +30,77 @@ std::vector<step_handle_t> untangle_cuts(
return seen_rev_step[self_index.get_step_idx(step)];
};
auto mark_seen_fwd_step = [&seen_fwd_step,&self_index](const step_handle_t& step) {
seen_fwd_step[self_index.get_step_idx(step)] = true;
auto idx = self_index.get_step_idx(step);
bool state = seen_fwd_step[idx];
seen_fwd_step[idx] = true;
return state;
};
auto mark_seen_rev_step = [&seen_rev_step,&self_index](const step_handle_t& step) {
seen_rev_step[self_index.get_step_idx(step)] = true;
auto idx = self_index.get_step_idx(step);
bool state = seen_rev_step[idx];
seen_rev_step[idx] = true;
return state;
};

std::vector<std::pair<uint64_t, step_handle_t>> cut_points;
auto clean_cut_points = [&cut_points,&step_index,&graph](void) {
std::sort(cut_points.begin(),
cut_points.end(),
[&](const std::pair<uint64_t,step_handle_t>& a,
const std::pair<uint64_t,step_handle_t>& b) {
return a.first < b.first;
});
std::vector<step_handle_t> c;
for (auto& x : cut_points) c.push_back(x.second);
// then take unique positions
c.erase(std::unique(c.begin(),
c.end()),
c.end());
return c;
};
std::vector<step_handle_t> cut_points;

std::deque<std::pair<step_handle_t, step_handle_t>> todo;
todo.push_back(std::make_pair(_start, _end));
while (!todo.empty()) {
auto start = todo.front().first;
auto end = todo.front().second;
uint64_t start_pos = step_index.get_position(start, graph);
uint64_t curr_pos = start_pos;
uint64_t end_pos = step_index.get_position(end, graph);
//std::cerr << "todo: " << start_pos << " " << end_pos << std::endl;
cut_points.push_back(start);
cut_points.push_back(std::pair(curr_pos, start));
todo.pop_front();
// we go forward until we see a loop, where the other step has position < end_pos and > start_pos
for (step_handle_t step = start; step != end; step = graph.get_next_step(step)) {
for (step_handle_t step = start;
step != end;
step = graph.get_next_step(step)) {
// we take the first and shortest loop we find
// TODO change this, it can be computed based on the node length
if (is_seen_fwd_step(step)) {
handle_t handle = graph.get_handle_of_step(step);
if (mark_seen_fwd_step(step)) {
curr_pos += graph.get_length(handle);
continue;
}
auto curr_pos = step_index.get_position(step, graph);
handle_t handle = graph.get_handle_of_step(step);
if (is_cut(handle)) {
cut_points.push_back(step);
/*
if (curr_pos != step_index.get_position(step, graph)) {
std::cerr << "position error fwd " << curr_pos << " " << step_index.get_position(step, graph) << std::endl;
exit(1);
}
*/
cut_points.push_back(std::pair(curr_pos, step));
//cut_points.push_back(std::make_pair(step_index.get_position(step, graph), step));
}
mark_seen_fwd_step(step);
bool found_loop = false;
step_handle_t other;
uint64_t other_pos = 0;
auto x = self_index.get_next_step_on_node(graph.get_id(handle), step);
if (x.first) {
auto other_pos = step_index.get_position(x.second, graph);
other_pos = x.second.second;
if (other_pos > start_pos
&& other_pos < end_pos
&& other_pos > curr_pos
&& !is_seen_fwd_step(x.second)) {
other = x.second;
&& !is_seen_fwd_step(x.second.first)) {
other = x.second.first;
found_loop = true;
}
}
Expand All @@ -80,7 +110,10 @@ std::vector<step_handle_t> untangle_cuts(
//std::cerr << "Found loop! " << step_index.get_position(step) << " " << step_index.get_position(other) << std::endl;
todo.push_back(std::make_pair(step, other));
// we then step forward to the loop end and continue iterating
curr_pos = other_pos + graph.get_length(graph.get_handle_of_step(other));
step = other;
} else {
curr_pos += graph.get_length(handle);
}
}
// TODO this block is the same as the previous one, but in reverse
Expand All @@ -89,34 +122,42 @@ std::vector<step_handle_t> untangle_cuts(
// now we reverse it
step_handle_t path_begin = graph.path_begin(path);
if (end == path_begin || !graph.has_previous_step(end)) {
return cut_points;
return clean_cut_points();
}
//step_handle_t _step = graph.get_previous_step(end);
curr_pos = step_index.get_position(end, graph) + graph.get_length(graph.get_handle_of_step(end));
//cut_points.push_back(std::make_pair(curr_pos, end));
//std::cerr << "reversing" << std::endl;
for (step_handle_t step = end;
step_index.get_position(step, graph) > start_pos;
step != start;
step = graph.get_previous_step(step)) {
if (is_seen_rev_step(step)) {
handle_t handle = graph.get_handle_of_step(step);
curr_pos -= graph.get_length(handle);
if (mark_seen_rev_step(step)) {
continue;
}
// we take the first and shortest loop we find
// TODO change this, it can be computed based on the node length
auto curr_pos = step_index.get_position(step, graph);
handle_t handle = graph.get_handle_of_step(step);
if (is_cut(handle)) {
cut_points.push_back(step);
/*
if (curr_pos != step_index.get_position(step, graph)) {
std::cerr << "position error rev " << curr_pos << " " << step_index.get_position(step, graph) << std::endl;
exit(1);
}
*/
cut_points.push_back(std::pair(curr_pos, step));
}
mark_seen_rev_step(step);
//std::cerr << "rev on step " << graph.get_id(handle) << " " << curr_pos << std::endl;
bool found_loop = false;
step_handle_t other;
auto x = self_index.get_prev_step_on_node(graph.get_id(handle), step);
uint64_t other_pos = 0;
if (x.first) {
auto other_pos = step_index.get_position(x.second, graph);
other_pos = x.second.second;
if (other_pos > start_pos
&& other_pos < end_pos
&& other_pos < curr_pos
&& !is_seen_rev_step(x.second)) {
other = x.second;
&& !is_seen_rev_step(x.second.first)) {
other = x.second.first;
found_loop = true;
}
}
Expand All @@ -125,25 +166,13 @@ std::vector<step_handle_t> untangle_cuts(
// to cut_points we add the start position, the result from recursion, and our end position
todo.push_back(std::make_pair(other, step));
// we then step forward to the loop end and continue iterating
curr_pos = other_pos;
step = other;
}
}
cut_points.push_back(end);
}
// and sort
std::sort(cut_points.begin(),
cut_points.end(),
[&](const step_handle_t& a,
const step_handle_t& b) {
return step_index.get_position(a, graph) < step_index.get_position(b, graph);
});
//auto prev_size = cut_points.size();
// then take unique positions
cut_points.erase(std::unique(cut_points.begin(),
cut_points.end()),
cut_points.end());
//std::cerr << "prev_size " << prev_size << " curr_size " << cut_points.size() << std::endl;
return cut_points;
return clean_cut_points();
}

void write_cuts(
Expand Down Expand Up @@ -835,7 +864,7 @@ void untangle(
if (show_progress) {
progress->finish();
}

// todo: split up the graph space into regions of cut_every bp
// write a map from node ids to segments
// walk along every path
Expand Down Expand Up @@ -869,7 +898,7 @@ void untangle(
if (show_progress) {
progress->finish();
}

}
} else {
uint64_t num_cut_points_read = 0;
Expand Down
11 changes: 10 additions & 1 deletion src/node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ const std::string& node_t::get_sequence() const {
return sequence;
}

void node_t::set_volatile(void) {
dynamic = true;
}

void node_t::set_static(void) {
dynamic = false;
}

// encode an internal representation of an external id (adding if none exists)
uint64_t node_t::encode(const uint64_t& other_id) {
uint64_t delta = to_delta(other_id);
Expand Down Expand Up @@ -333,6 +341,7 @@ void node_t::clear_encoding() {
void node_t::copy(const node_t& other) {
clear();
id = other.id;
dynamic = other.dynamic;
sequence = other.sequence;
edges = other.edges;
decoding = other.decoding;
Expand Down Expand Up @@ -439,7 +448,7 @@ void node_t::load(std::istream& in) {
in.read((char*)sequence.c_str(), len*sizeof(uint8_t));
in.read((char*)&id, sizeof(id));
edges.load(in);
decoding.load(in);
decoding.load(in);
paths.load(in);
//display();
}
Expand Down
15 changes: 11 additions & 4 deletions src/node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const uint8_t PATH_RECORD_LENGTH = 6;
class node_t {
uint64_t id = 0;
std::atomic_flag lock = ATOMIC_FLAG_INIT;
bool dynamic = true;
std::string sequence;
dyn::hacked_vector edges;
dyn::hacked_vector decoding;
Expand Down Expand Up @@ -82,16 +83,20 @@ class node_t {
return type & (1 << 3);
}
};

public:
node_t(void); // constructor
// locking methods
inline void get_lock(void) {
while (lock.test_and_set(std::memory_order_acquire)) // acquire lock
; // spin
if (dynamic) {
while (lock.test_and_set(std::memory_order_acquire)) // acquire lock
; // spin
}
}
inline void clear_lock(void) {
lock.clear(std::memory_order_release);
if (dynamic) {
lock.clear(std::memory_order_release);
}
}
inline const uint64_t edge_count(void) const { return edges.size()/EDGE_RECORD_LENGTH; }
inline const uint64_t path_count(void) const { return paths.size()/PATH_RECORD_LENGTH; }
Expand All @@ -114,6 +119,8 @@ class node_t {
void set_sequence(const std::string& seq);
const uint64_t& get_id(void) const;
void set_id(const uint64_t& new_id);
void set_volatile(void);
void set_static(void);
void for_each_edge(const std::function<bool(uint64_t other_id,
bool on_rev,
bool other_rev,
Expand Down
12 changes: 12 additions & 0 deletions src/odgi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1725,4 +1725,16 @@ void graph_t::copy(const graph_t& other) {
});
}

void graph_t::set_static(void) {
for (auto& node : node_v) {
node->set_static();
}
}

void graph_t::set_volatile(void) {
for (auto& node : node_v) {
node->set_volatile();
}
}

}
Loading