diff --git a/CMakeLists.txt b/CMakeLists.txt index 28318875..0172ace5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ endif() # Set optimization through command line; see INSTALL.md if (${CMAKE_BUILD_TYPE} MATCHES Release) set(EXTRA_FLAGS "-Ofast -march=native -pipe -msse4.2 -funroll-all-loops") # -fprofile-generate=../pgo") + #set(EXTRA_FLAGS "-O0 -march=native -msse4.2 -pg") # for profiling with gprof set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG") # reset CXX_FLAGS to be able to replace -O3 with -Ofast endif () diff --git a/src/algorithms/stepindex.cpp b/src/algorithms/stepindex.cpp index cf9332c9..194559ae 100644 --- a/src/algorithms/stepindex.cpp +++ b/src/algorithms/stepindex.cpp @@ -275,14 +275,14 @@ path_step_index_t::path_step_index_t(const PathHandleGraph& graph, node_offset[0] = 0; // first offset is 0 by definition for (auto& node_step : steps_by_node) { auto& idx = std::get<0>(node_step); - //auto& offset = std::get<1>(node_step); // just used for sorting + auto& offset = std::get<1>(node_step); // just used for sorting auto& step = std::get<2>(node_step); //std::cerr << "idx = " << idx << " " << as_integers(step)[0] << ":" << as_integers(step)[1] << std::endl; if (idx != last_idx) { node_offset[idx] = node_steps.size(); } step_offset[step_mphf->lookup(step)] = node_steps.size(); - node_steps.push_back(step); + node_steps.push_back(std::make_pair(step, offset)); last_idx = idx; } if (last_idx != node_count-1) { @@ -313,7 +313,7 @@ uint64_t path_step_index_t::n_steps_on_node(const nid_t& id) const { return node_offset[idx+1] - node_offset[idx]; } -std::pair +std::pair> path_step_index_t::get_next_step_on_node(const nid_t& id, const step_handle_t& step) const { auto node_idx = get_node_idx(id); auto curr_steps = node_offset[node_idx]; @@ -323,12 +323,12 @@ path_step_index_t::get_next_step_on_node(const nid_t& id, const step_handle_t& s if (has_next) { return std::make_pair(true, node_steps[step_idx+1]); } else { - step_handle_t empty_step; + std::pair empty_step; return std::make_pair(false, empty_step); } } -std::pair +std::pair> path_step_index_t::get_prev_step_on_node(const nid_t& id, const step_handle_t& step) const { auto curr_steps = node_offset[get_node_idx(id)]; auto step_idx = step_offset[get_step_idx(step)]; @@ -336,7 +336,7 @@ path_step_index_t::get_prev_step_on_node(const nid_t& id, const step_handle_t& s if (has_prev) { return std::make_pair(true, node_steps[step_idx-1]); } else { - step_handle_t empty_step; + std::pair empty_step; return std::make_pair(false, empty_step); } } diff --git a/src/algorithms/stepindex.hpp b/src/algorithms/stepindex.hpp index 3db8d5de..edf504ab 100644 --- a/src/algorithms/stepindex.hpp +++ b/src/algorithms/stepindex.hpp @@ -99,7 +99,7 @@ struct path_step_index_t { // map to the beginning of a range in node_steps std::vector node_offset; // record the steps in positional order by node (index given in node_offset) - std::vector node_steps; + std::vector> node_steps; // map from step to an index in step_offset boophf_step_t* step_mphf = nullptr; // index in handle_steps for the given step @@ -114,9 +114,11 @@ struct path_step_index_t { uint64_t n_steps_on_node(const nid_t& id) const; // these functions require, but do not check, that our step is in the indexed path // next step on node (sorted by position in path), (false, _) if there is no next step - std::pair get_next_step_on_node(const nid_t& id, const step_handle_t& step) const; + std::pair> + get_next_step_on_node(const nid_t& id, const step_handle_t& step) const; // prev step on node (sorted by position in path), (false, _) if there is no next step - std::pair get_prev_step_on_node(const nid_t& id, const step_handle_t& step) const; + std::pair> + get_prev_step_on_node(const nid_t& id, const step_handle_t& step) const; }; } diff --git a/src/algorithms/untangle.cpp b/src/algorithms/untangle.cpp index 735f0b8c..88e3a53f 100644 --- a/src/algorithms/untangle.cpp +++ b/src/algorithms/untangle.cpp @@ -14,14 +14,12 @@ std::vector untangle_cuts( const std::function& is_cut) { auto path = graph.get_path_handle_of_step(_start); auto path_name = graph.get_path_name(path); - // this assumes that the end is not inclusive + /* std::cerr << "untangle_cuts(" << path_name << ", " << step_index.get_position(_start) << ", " << step_index.get_position(_end) << ")" << std::endl; */ - // TODO make a vector of handles we've seen as segment starts and of segment ends - // to avoid duplicating work std::vector seen_fwd_step(self_index.step_count); std::vector seen_rev_step(self_index.step_count); @@ -32,45 +30,77 @@ std::vector untangle_cuts( return seen_rev_step[self_index.get_step_idx(step)]; }; auto mark_seen_fwd_step = [&seen_fwd_step,&self_index](const step_handle_t& step) { - seen_fwd_step[self_index.get_step_idx(step)] = true; + auto idx = self_index.get_step_idx(step); + bool state = seen_fwd_step[idx]; + seen_fwd_step[idx] = true; + return state; }; auto mark_seen_rev_step = [&seen_rev_step,&self_index](const step_handle_t& step) { - seen_rev_step[self_index.get_step_idx(step)] = true; + auto idx = self_index.get_step_idx(step); + bool state = seen_rev_step[idx]; + seen_rev_step[idx] = true; + return state; + }; + + std::vector> cut_points; + auto clean_cut_points = [&cut_points,&step_index,&graph](void) { + std::sort(cut_points.begin(), + cut_points.end(), + [&](const std::pair& a, + const std::pair& b) { + return a.first < b.first; + }); + std::vector c; + for (auto& x : cut_points) c.push_back(x.second); + // then take unique positions + c.erase(std::unique(c.begin(), + c.end()), + c.end()); + return c; }; - std::vector cut_points; + std::deque> todo; todo.push_back(std::make_pair(_start, _end)); while (!todo.empty()) { auto start = todo.front().first; auto end = todo.front().second; uint64_t start_pos = step_index.get_position(start, graph); + uint64_t curr_pos = start_pos; uint64_t end_pos = step_index.get_position(end, graph); //std::cerr << "todo: " << start_pos << " " << end_pos << std::endl; - cut_points.push_back(start); + cut_points.push_back(std::pair(curr_pos, start)); todo.pop_front(); // we go forward until we see a loop, where the other step has position < end_pos and > start_pos - for (step_handle_t step = start; step != end; step = graph.get_next_step(step)) { + for (step_handle_t step = start; + step != end; + step = graph.get_next_step(step)) { // we take the first and shortest loop we find - // TODO change this, it can be computed based on the node length - if (is_seen_fwd_step(step)) { + handle_t handle = graph.get_handle_of_step(step); + if (mark_seen_fwd_step(step)) { + curr_pos += graph.get_length(handle); continue; } - auto curr_pos = step_index.get_position(step, graph); - handle_t handle = graph.get_handle_of_step(step); if (is_cut(handle)) { - cut_points.push_back(step); + /* + if (curr_pos != step_index.get_position(step, graph)) { + std::cerr << "position error fwd " << curr_pos << " " << step_index.get_position(step, graph) << std::endl; + exit(1); + } + */ + cut_points.push_back(std::pair(curr_pos, step)); + //cut_points.push_back(std::make_pair(step_index.get_position(step, graph), step)); } - mark_seen_fwd_step(step); bool found_loop = false; step_handle_t other; + uint64_t other_pos = 0; auto x = self_index.get_next_step_on_node(graph.get_id(handle), step); if (x.first) { - auto other_pos = step_index.get_position(x.second, graph); + other_pos = x.second.second; if (other_pos > start_pos && other_pos < end_pos && other_pos > curr_pos - && !is_seen_fwd_step(x.second)) { - other = x.second; + && !is_seen_fwd_step(x.second.first)) { + other = x.second.first; found_loop = true; } } @@ -80,7 +110,10 @@ std::vector untangle_cuts( //std::cerr << "Found loop! " << step_index.get_position(step) << " " << step_index.get_position(other) << std::endl; todo.push_back(std::make_pair(step, other)); // we then step forward to the loop end and continue iterating + curr_pos = other_pos + graph.get_length(graph.get_handle_of_step(other)); step = other; + } else { + curr_pos += graph.get_length(handle); } } // TODO this block is the same as the previous one, but in reverse @@ -89,34 +122,42 @@ std::vector untangle_cuts( // now we reverse it step_handle_t path_begin = graph.path_begin(path); if (end == path_begin || !graph.has_previous_step(end)) { - return cut_points; + return clean_cut_points(); } + //step_handle_t _step = graph.get_previous_step(end); + curr_pos = step_index.get_position(end, graph) + graph.get_length(graph.get_handle_of_step(end)); + //cut_points.push_back(std::make_pair(curr_pos, end)); //std::cerr << "reversing" << std::endl; for (step_handle_t step = end; - step_index.get_position(step, graph) > start_pos; + step != start; step = graph.get_previous_step(step)) { - if (is_seen_rev_step(step)) { + handle_t handle = graph.get_handle_of_step(step); + curr_pos -= graph.get_length(handle); + if (mark_seen_rev_step(step)) { continue; } // we take the first and shortest loop we find - // TODO change this, it can be computed based on the node length - auto curr_pos = step_index.get_position(step, graph); - handle_t handle = graph.get_handle_of_step(step); if (is_cut(handle)) { - cut_points.push_back(step); + /* + if (curr_pos != step_index.get_position(step, graph)) { + std::cerr << "position error rev " << curr_pos << " " << step_index.get_position(step, graph) << std::endl; + exit(1); + } + */ + cut_points.push_back(std::pair(curr_pos, step)); } - mark_seen_rev_step(step); //std::cerr << "rev on step " << graph.get_id(handle) << " " << curr_pos << std::endl; bool found_loop = false; step_handle_t other; auto x = self_index.get_prev_step_on_node(graph.get_id(handle), step); + uint64_t other_pos = 0; if (x.first) { - auto other_pos = step_index.get_position(x.second, graph); + other_pos = x.second.second; if (other_pos > start_pos && other_pos < end_pos && other_pos < curr_pos - && !is_seen_rev_step(x.second)) { - other = x.second; + && !is_seen_rev_step(x.second.first)) { + other = x.second.first; found_loop = true; } } @@ -125,25 +166,13 @@ std::vector untangle_cuts( // to cut_points we add the start position, the result from recursion, and our end position todo.push_back(std::make_pair(other, step)); // we then step forward to the loop end and continue iterating + curr_pos = other_pos; step = other; } } - cut_points.push_back(end); } // and sort - std::sort(cut_points.begin(), - cut_points.end(), - [&](const step_handle_t& a, - const step_handle_t& b) { - return step_index.get_position(a, graph) < step_index.get_position(b, graph); - }); - //auto prev_size = cut_points.size(); - // then take unique positions - cut_points.erase(std::unique(cut_points.begin(), - cut_points.end()), - cut_points.end()); - //std::cerr << "prev_size " << prev_size << " curr_size " << cut_points.size() << std::endl; - return cut_points; + return clean_cut_points(); } void write_cuts( @@ -835,7 +864,7 @@ void untangle( if (show_progress) { progress->finish(); } - + // todo: split up the graph space into regions of cut_every bp // write a map from node ids to segments // walk along every path @@ -869,7 +898,7 @@ void untangle( if (show_progress) { progress->finish(); } - + } } else { uint64_t num_cut_points_read = 0; diff --git a/src/node.cpp b/src/node.cpp index 04cfca13..51bb3c70 100644 --- a/src/node.cpp +++ b/src/node.cpp @@ -22,6 +22,14 @@ const std::string& node_t::get_sequence() const { return sequence; } +void node_t::set_volatile(void) { + dynamic = true; +} + +void node_t::set_static(void) { + dynamic = false; +} + // encode an internal representation of an external id (adding if none exists) uint64_t node_t::encode(const uint64_t& other_id) { uint64_t delta = to_delta(other_id); @@ -333,6 +341,7 @@ void node_t::clear_encoding() { void node_t::copy(const node_t& other) { clear(); id = other.id; + dynamic = other.dynamic; sequence = other.sequence; edges = other.edges; decoding = other.decoding; @@ -439,7 +448,7 @@ void node_t::load(std::istream& in) { in.read((char*)sequence.c_str(), len*sizeof(uint8_t)); in.read((char*)&id, sizeof(id)); edges.load(in); - decoding.load(in); + decoding.load(in); paths.load(in); //display(); } diff --git a/src/node.hpp b/src/node.hpp index 3c91c651..277e729c 100644 --- a/src/node.hpp +++ b/src/node.hpp @@ -26,6 +26,7 @@ const uint8_t PATH_RECORD_LENGTH = 6; class node_t { uint64_t id = 0; std::atomic_flag lock = ATOMIC_FLAG_INIT; + bool dynamic = true; std::string sequence; dyn::hacked_vector edges; dyn::hacked_vector decoding; @@ -82,16 +83,20 @@ class node_t { return type & (1 << 3); } }; - + public: node_t(void); // constructor // locking methods inline void get_lock(void) { - while (lock.test_and_set(std::memory_order_acquire)) // acquire lock - ; // spin + if (dynamic) { + while (lock.test_and_set(std::memory_order_acquire)) // acquire lock + ; // spin + } } inline void clear_lock(void) { - lock.clear(std::memory_order_release); + if (dynamic) { + lock.clear(std::memory_order_release); + } } inline const uint64_t edge_count(void) const { return edges.size()/EDGE_RECORD_LENGTH; } inline const uint64_t path_count(void) const { return paths.size()/PATH_RECORD_LENGTH; } @@ -114,6 +119,8 @@ class node_t { void set_sequence(const std::string& seq); const uint64_t& get_id(void) const; void set_id(const uint64_t& new_id); + void set_volatile(void); + void set_static(void); void for_each_edge(const std::functionset_static(); + } +} + +void graph_t::set_volatile(void) { + for (auto& node : node_v) { + node->set_volatile(); + } +} + } diff --git a/src/odgi.hpp b/src/odgi.hpp index 7f2d5eb1..495df37f 100644 --- a/src/odgi.hpp +++ b/src/odgi.hpp @@ -521,6 +521,12 @@ class graph_t : public MutablePathDeletableHandleGraph, public SerializableHandl /// get the backing node rank for a given node id uint64_t get_node_rank(const nid_t& node_id) const; + /// set the graph into static mode, which avoids spinlocks + void set_static(void); + + /// set the graph into volatile mode, which uses (spin)locks for dynamic hogwilding + void set_volatile(void); + }; //const static uint64_t path_begin_marker = std::numeric_limits::max(); diff --git a/src/subcommand/untangle_main.cpp b/src/subcommand/untangle_main.cpp index bcdcc179..25570e27 100644 --- a/src/subcommand/untangle_main.cpp +++ b/src/subcommand/untangle_main.cpp @@ -62,8 +62,10 @@ int main_untangle(int argc, char **argv) { args::Flag make_self_dotplot(debugging_opts, "DOTPLOT", "Render a table showing the positional dotplot of the query against itself.", {'S', "self-dotplot"}); args::Group step_index_opts(parser, "[ Step Index Options ]"); - args::ValueFlag _step_index(step_index_opts, "FILE", "Load the step index from this *FILE*. The file name usually ends with *.stpidx*. (default: build the step index from scratch with a sampling rate of 8).", + args::ValueFlag _step_index(step_index_opts, "FILE", "Load the step index from this *FILE*. The file name usually ends with *.stpidx*. (default: build the step index from scratch with sampling rate 8).", {'a', "step-index"}); + args::ValueFlag _step_sampling(step_index_opts, "N", "Sampling rate to use for step index construction. (default: 8).", + {'D', "step-sampling"}); args::Group threading(parser, "[ Threading ]"); args::ValueFlag nthreads( threading, "N", "Number of threads to use for parallel operations.", {'t', "threads"}); @@ -110,6 +112,7 @@ int main_untangle(int argc, char **argv) { graph); } } + graph.set_static(); } // path loading @@ -208,13 +211,14 @@ int main_untangle(int argc, char **argv) { } } else { if (!_step_index) { + auto step_sampling = _step_sampling ? args::get(_step_sampling) : 8; if (progress) { std::cerr - << "[odgi::untangle] warning: no step index specified. Building one with a sample rate of 8. This may take additional time. " - "A step index can be provided via -a, --step-index. A step index can be built using odgi stepindex." - << std::endl; + << "[odgi::untangle] warning: no step index specified. Building one with a sample rate of " << step_sampling << ". This may take additional time. " + "A step index can be provided via -a, --step-index. A step index can be built using odgi stepindex." + << std::endl; } - algorithms::step_index_t step_index(graph, paths, num_threads, progress, 8); + algorithms::step_index_t step_index(graph, paths, num_threads, progress, step_sampling); algorithms::untangle(graph, query_paths, target_paths,