From 68d4f11eac06b2c0069b339fb5b4c3c2e70f3684 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 17 Oct 2022 16:40:05 +0200 Subject: [PATCH 01/29] prepare basic functions for ssi integration --- src/algorithms/stepindex.cpp | 4 ++ src/algorithms/stepindex.hpp | 1 + src/subcommand/sort_main.cpp | 32 +++++++---- src/unittest/stepindex.cpp | 101 +++-------------------------------- 4 files changed, 35 insertions(+), 103 deletions(-) diff --git a/src/algorithms/stepindex.cpp b/src/algorithms/stepindex.cpp index dc8ee953..a246426c 100644 --- a/src/algorithms/stepindex.cpp +++ b/src/algorithms/stepindex.cpp @@ -129,6 +129,10 @@ void step_index_t::load(const std::string& name) { step_mphf->load(stpidx_in); } +const uint64_t step_index_t::get_sample_rate() { + return this->sample_rate; +} + void step_index_t::serialize_members(std::ostream &out) const { serialize_and_measure(out); } diff --git a/src/algorithms/stepindex.hpp b/src/algorithms/stepindex.hpp index 3db8d5de..66684c9c 100644 --- a/src/algorithms/stepindex.hpp +++ b/src/algorithms/stepindex.hpp @@ -63,6 +63,7 @@ struct step_index_t { const uint64_t get_path_len(const path_handle_t& path) const; void save(const std::string& name) const; void load(const std::string& name); + const uint64_t get_sample_rate(); // map from step to position in its path boophf_step_t* step_mphf = nullptr; sdsl::int_vector<64> pos; diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index 5b7f6ccb..3fd4ec7a 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -14,6 +14,7 @@ #include "algorithms/xp.hpp" #include "algorithms/path_sgd.hpp" #include "algorithms/groom.hpp" +#include "algorithms/stepindex.hpp" namespace odgi { @@ -38,6 +39,7 @@ int main_sort(int argc, char** argv) { " ending with *.og* is recommended.", {'o', "out"}); args::Group files_io_opts(parser, "[ Files IO Options ]"); args::ValueFlag xp_in_file(files_io_opts, "FILE", "Load the succinct variation graph index from this *FILE*. The file name usually ends with *.xp*.", {'X', "path-index"}); + args::ValueFlag ssi_in_file(files_io_opts, "FILE", "Load the sampled step index from this *FILE*. The file name usually ends with *.ssi*.", {'e', "sampled-step-index"}); args::ValueFlag sort_order_in(files_io_opts, "FILE", "*FILE* containing the sort order. Each line contains one node identifer.", {'s', "sort-order"}); args::ValueFlag tmp_base(files_io_opts, "PATH", "directory for temporary files", {'C', "temp-dir"}); args::Group topo_sorts_opts(parser, "[ Topological Sort Options ]"); @@ -171,25 +173,25 @@ int main_sort(int argc, char** argv) { /// path guided linear 1D SGD sort helpers // TODO beautify this, maybe put into its own file std::function &, - const xp::XP &)> get_sum_path_step_count - = [&](const std::vector &path_sgd_use_paths, const xp::XP &path_index) { + graph_t &)> get_sum_path_step_count + = [&](const std::vector &path_sgd_use_paths, graph_t &graph) { uint64_t sum_path_step_count = 0; for (auto& path : path_sgd_use_paths) { - sum_path_step_count += path_index.get_path_step_count(path); + sum_path_step_count += graph.get_step_count(path); } return sum_path_step_count; }; std::function &, - const xp::XP &)> get_max_path_step_count - = [&](const std::vector &path_sgd_use_paths, const xp::XP &path_index) { + graph_t &)> get_max_path_step_count + = [&](const std::vector &path_sgd_use_paths, graph_t &graph) { uint64_t max_path_step_count = 0; for (auto& path : path_sgd_use_paths) { - max_path_step_count = std::max(max_path_step_count, path_index.get_path_step_count(path)); + max_path_step_count = std::max(max_path_step_count, graph.get_step_count(path)); } return max_path_step_count; }; std::function &, - const xp::XP &)> get_max_path_length + const xp::XP &)> get_max_path_length_xp = [&](const std::vector &path_sgd_use_paths, const xp::XP &path_index) { uint64_t max_path_length = std::numeric_limits::min(); for (auto &path : path_sgd_use_paths) { @@ -198,6 +200,16 @@ int main_sort(int argc, char** argv) { return max_path_length; }; + std::function &, + const algorithms::step_index_t &)> get_max_path_length_ssi + = [&](const std::vector &path_sgd_use_paths, const algorithms::step_index_t &sampled_step_index) { + uint64_t max_path_length = std::numeric_limits::min(); + for (auto &path : path_sgd_use_paths) { + max_path_length = std::max(max_path_length, sampled_step_index.get_path_len(path)); + } + return max_path_length; + }; + // default parameters std::string path_sgd_seed; if (p_sgd_seed) { @@ -364,7 +376,7 @@ int main_sort(int argc, char** argv) { path_sgd_use_paths.push_back(path); }); } - uint64_t sum_path_step_count = get_sum_path_step_count(path_sgd_use_paths, path_index); + uint64_t sum_path_step_count = get_sum_path_step_count(path_sgd_use_paths, graph); if (args::get(p_sgd_min_term_updates_paths)) { path_sgd_min_term_updates = args::get(p_sgd_min_term_updates_paths) * sum_path_step_count; } else { @@ -374,8 +386,8 @@ int main_sort(int argc, char** argv) { path_sgd_min_term_updates = 1.0 * sum_path_step_count; } } - uint64_t max_path_step_count = get_max_path_step_count(path_sgd_use_paths, path_index); - path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? args::get(p_sgd_zipf_space) : get_max_path_length(path_sgd_use_paths, path_index); + uint64_t max_path_step_count = get_max_path_step_count(path_sgd_use_paths, graph); + path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? args::get(p_sgd_zipf_space) : get_max_path_length_xp(path_sgd_use_paths, path_index); path_sgd_zipf_space_max = args::get(p_sgd_zipf_space_max) ? args::get(p_sgd_zipf_space_max) : 100; path_sgd_zipf_max_number_of_distributions = args::get(p_sgd_zipf_max_number_of_distributions) ? std::max( diff --git a/src/unittest/stepindex.cpp b/src/unittest/stepindex.cpp index 62f4d5c4..5f2bc180 100644 --- a/src/unittest/stepindex.cpp +++ b/src/unittest/stepindex.cpp @@ -79,6 +79,7 @@ namespace odgi { SECTION("The index delivers the correct positions for a given step. Sample rate: 1.") { step_index_t step_index_1(graph, paths, 1, false, 1); + REQUIRE(step_index_1.get_sample_rate() == 1); graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); if (cur_path == "target") { @@ -188,6 +189,7 @@ namespace odgi { SECTION("The index delivers the correct positions for a given step. Sample rate: 2.") { step_index_t step_index_2(graph, paths, 1, false, 2); + REQUIRE(step_index_2.get_sample_rate() == 2); graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); if (cur_path == "target") { @@ -281,99 +283,7 @@ namespace odgi { SECTION("The index delivers the correct positions for a given step. Sample rate: 4.") { step_index_t step_index_4(graph, paths, 1, false, 4); - graph.for_each_path_handle([&](const path_handle_t path) { - std::string cur_path = graph.get_path_name(path); - if (cur_path == "target") { - uint64_t cur_step_rank = 0; - graph.for_each_step_in_path(path, [&](const step_handle_t& occ) { - switch(cur_step_rank) { - case 0: - REQUIRE(step_index_4.get_position(occ, graph) == 0); - break; - case 1: - REQUIRE(step_index_4.get_position(occ, graph) == 1); - break; - case 2: - REQUIRE(step_index_4.get_position(occ, graph) == 2); - break; - case 3: - REQUIRE(step_index_4.get_position(occ, graph) == 5); - break; - case 4: - REQUIRE(step_index_4.get_position(occ, graph) == 8); - break; - case 5: - REQUIRE(step_index_4.get_position(occ, graph) == 11); - break; - } - cur_step_rank++; - }); - } - - if (cur_path == "query1") { - uint64_t cur_step_rank = 0; - graph.for_each_step_in_path(path, [&](const step_handle_t& occ) { - switch(cur_step_rank) { - case 0: - REQUIRE(step_index_4.get_position(occ, graph) == 0); - break; - case 1: - REQUIRE(step_index_4.get_position(occ, graph) == 1); - break; - case 2: - REQUIRE(step_index_4.get_position(occ, graph) == 2); - break; - case 3: - REQUIRE(step_index_4.get_position(occ, graph) == 5); - break; - } - cur_step_rank++; - }); - } - - if (cur_path == "query2") { - uint64_t cur_step_rank = 0; - graph.for_each_step_in_path(path, [&](const step_handle_t& occ) { - switch(cur_step_rank) { - case 0: - REQUIRE(step_index_4.get_position(occ, graph) == 0); - break; - } - cur_step_rank++; - }); - } - - if (cur_path == "query3") { - uint64_t cur_step_rank = 0; - graph.for_each_step_in_path(path, [&](const step_handle_t& occ) { - switch(cur_step_rank) { - case 0: - REQUIRE(step_index_4.get_position(occ, graph) == 0); - break; - case 1: - REQUIRE(step_index_4.get_position(occ, graph) == 3); - break; - case 2: - REQUIRE(step_index_4.get_position(occ, graph) == 4); - break; - case 3: - REQUIRE(step_index_4.get_position(occ, graph) == 5); - break; - case 4: - REQUIRE(step_index_4.get_position(occ, graph) == 8); - break; - case 5: - REQUIRE(step_index_4.get_position(occ, graph) == 11); - break; - } - cur_step_rank++; - }); - } - }); - } - - SECTION("The index delivers the correct positions for a given step. Sample rate: 4.") { - step_index_t step_index_4(graph, paths, 1, false, 4); + REQUIRE(step_index_4.get_sample_rate() == 4); graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); if (cur_path == "target") { @@ -467,6 +377,7 @@ namespace odgi { SECTION("The index delivers the correct positions for a given step. Sample rate: 8.") { step_index_t step_index_8(graph, paths, 1, false, 8); + REQUIRE(step_index_8.get_sample_rate() == 8); graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); if (cur_path == "target") { @@ -560,6 +471,7 @@ namespace odgi { SECTION("The index delivers the correct positions for a given step. Sample rate: 16.") { step_index_t step_index_16(graph, paths, 1, false, 16); + REQUIRE(step_index_16.get_sample_rate() == 16); graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); if (cur_path == "target") { @@ -660,6 +572,9 @@ namespace odgi { step_index_t step_index_loaded; step_index_loaded.load(basename + "unittest.stpidx"); + REQUIRE(step_index_to_save.get_sample_rate() == 8); + REQUIRE(step_index_loaded.get_sample_rate() == 8); + graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); if (cur_path == "target") { From c00d7f38d334d025974283be73d5f52ca1bfb74f Mon Sep 17 00:00:00 2001 From: subwaystation Date: Wed, 19 Oct 2022 15:55:40 +0200 Subject: [PATCH 02/29] restructering --- src/algorithms/{ => pg_sgd}/path_sgd.cpp | 0 src/algorithms/{ => pg_sgd}/path_sgd.hpp | 8 +++---- .../{ => pg_sgd}/path_sgd_layout.cpp | 0 .../{ => pg_sgd}/path_sgd_layout.hpp | 4 ++-- src/subcommand/layout_main.cpp | 3 +-- src/subcommand/sort_main.cpp | 23 ++++++++++++++----- src/unittest/sort.cpp | 2 +- 7 files changed, 25 insertions(+), 15 deletions(-) rename src/algorithms/{ => pg_sgd}/path_sgd.cpp (100%) rename src/algorithms/{ => pg_sgd}/path_sgd.hpp (96%) rename src/algorithms/{ => pg_sgd}/path_sgd_layout.cpp (100%) rename src/algorithms/{ => pg_sgd}/path_sgd_layout.hpp (98%) diff --git a/src/algorithms/path_sgd.cpp b/src/algorithms/pg_sgd/path_sgd.cpp similarity index 100% rename from src/algorithms/path_sgd.cpp rename to src/algorithms/pg_sgd/path_sgd.cpp diff --git a/src/algorithms/path_sgd.hpp b/src/algorithms/pg_sgd/path_sgd.hpp similarity index 96% rename from src/algorithms/path_sgd.hpp rename to src/algorithms/pg_sgd/path_sgd.hpp index 977aaa32..7dd43701 100644 --- a/src/algorithms/path_sgd.hpp +++ b/src/algorithms/pg_sgd/path_sgd.hpp @@ -10,17 +10,17 @@ #include #include #include -#include "xp.hpp" -#include "sgd_term.hpp" +#include "../xp.hpp" +#include "../sgd_term.hpp" #include "IITree.h" #include #include #include -#include "weakly_connected_components.hpp" +#include "../weakly_connected_components.hpp" #include #include "dirty_zipfian_int_distribution.h" #include "XoshiroCpp.hpp" -#include "progress.hpp" +#include "../progress.hpp" #include "utils.hpp" #include diff --git a/src/algorithms/path_sgd_layout.cpp b/src/algorithms/pg_sgd/path_sgd_layout.cpp similarity index 100% rename from src/algorithms/path_sgd_layout.cpp rename to src/algorithms/pg_sgd/path_sgd_layout.cpp diff --git a/src/algorithms/path_sgd_layout.hpp b/src/algorithms/pg_sgd/path_sgd_layout.hpp similarity index 98% rename from src/algorithms/path_sgd_layout.hpp rename to src/algorithms/pg_sgd/path_sgd_layout.hpp index 26824841..b2d6fada 100644 --- a/src/algorithms/path_sgd_layout.hpp +++ b/src/algorithms/pg_sgd/path_sgd_layout.hpp @@ -10,7 +10,7 @@ #include #include #include -#include "xp.hpp" +#include "../xp.hpp" #include "IITree.h" #include #include @@ -18,7 +18,7 @@ #include #include "dirty_zipfian_int_distribution.h" #include "XoshiroCpp.hpp" -#include "progress.hpp" +#include "../progress.hpp" namespace odgi { namespace algorithms { diff --git a/src/subcommand/layout_main.cpp b/src/subcommand/layout_main.cpp index 1a3645b8..c3d8b13f 100644 --- a/src/subcommand/layout_main.cpp +++ b/src/subcommand/layout_main.cpp @@ -4,8 +4,7 @@ #include "args.hxx" #include #include "algorithms/xp.hpp" -#include "algorithms/sgd_layout.hpp" -#include "algorithms/path_sgd_layout.hpp" +#include "algorithms/pg_sgd/path_sgd_layout.hpp" #include "algorithms/draw.hpp" #include "algorithms/layout.hpp" #include "hilbert.hpp" diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index 3fd4ec7a..93c5f922 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -12,7 +12,7 @@ //#include "algorithms/mondriaan_sort.hpp" #include "algorithms/linear_sgd.hpp" #include "algorithms/xp.hpp" -#include "algorithms/path_sgd.hpp" +#include "algorithms/pg_sgd/path_sgd.hpp" #include "algorithms/groom.hpp" #include "algorithms/stepindex.hpp" @@ -151,6 +151,13 @@ int main_sort(int argc, char** argv) { return 1; } + if (xp_in_file && ssi_in_file) { + std::cerr + << "[odgi::sort] error: the PG-SGD algorithm can only work with an XP (odgi pathindex) or with a sample step index (odgi stepindex). Please only enter one of those." + << std::endl; + return 1; + } + const uint64_t num_threads = args::get(nthreads) ? args::get(nthreads) : 1; graph_t graph; @@ -227,6 +234,7 @@ int main_sort(int argc, char** argv) { return 1; } + // even if no XP is given, it is likely we will build one automatically if (tmp_base) { xp::temp_file::set_dir(args::get(tmp_base)); } else { @@ -333,7 +341,9 @@ int main_sort(int argc, char** argv) { uint64_t path_sgd_zipf_space, path_sgd_zipf_space_max, path_sgd_zipf_space_quantization_step, path_sgd_zipf_max_number_of_distributions; std::vector path_sgd_use_paths; xp::XP path_index; - bool fresh_path_index = false; + // TODO add SSI here + // const algorithms::step_index_t &sampled_step_index + bool fresh_step_index = false; std::string snapshot_prefix; if (snapshot) { snapshot_prefix = args::get(p_sgd_snapshot); @@ -345,7 +355,7 @@ int main_sort(int argc, char** argv) { target_paths = load_paths(args::get(_p_sgd_target_paths)); sort_graph_by_target_paths(graph, target_paths, is_ref); } - // take care of path index + // TODO take care of path index or sampled step index if (xp_in_file) { std::ifstream in; in.open(args::get(xp_in_file)); @@ -354,7 +364,7 @@ int main_sort(int argc, char** argv) { } else { path_index.from_handle_graph(graph, num_threads); } - fresh_path_index = true; + fresh_step_index = true; // do we only want so sample from a subset of paths? if (p_sgd_in_file) { std::string buf; @@ -450,7 +460,8 @@ int main_sort(int argc, char** argv) { order = algorithms::random_order(graph); break; case 'Y': { - if (!fresh_path_index) { + if (!fresh_step_index) { + // TODO what about the SSI? path_index.clean(); // do we have to sort by reference nodes first? if (_p_sgd_target_paths) { @@ -502,7 +513,7 @@ int main_sort(int argc, char** argv) { assert(false); } graph.apply_ordering(order, true); - fresh_path_index = false; + fresh_step_index = false; } } else if (args::get(two)) { graph.apply_ordering(algorithms::two_way_topological_order(&graph), true); diff --git a/src/unittest/sort.cpp b/src/unittest/sort.cpp index 35cc9212..9cd2f071 100644 --- a/src/unittest/sort.cpp +++ b/src/unittest/sort.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include "algorithms/pg_sgd/path_sgd.hpp" namespace odgi { namespace unittest { From 7e388e64e9d7c5ea04cb8c04462eb014100bc182 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Wed, 19 Oct 2022 15:56:06 +0200 Subject: [PATCH 03/29] forgetti spaghetti --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d8c732b..38e1b5e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -591,8 +591,8 @@ add_library(odgi_objs OBJECT ${CMAKE_SOURCE_DIR}/src/algorithms/unchop.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/perfect_neighbors.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/cover.cpp - ${CMAKE_SOURCE_DIR}/src/algorithms/path_sgd.cpp - ${CMAKE_SOURCE_DIR}/src/algorithms/path_sgd_layout.cpp + ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd.cpp + ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_layout.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/draw.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/layout.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/atomic_image.cpp @@ -759,8 +759,8 @@ set(odgi_HEADERS ${CMAKE_SOURCE_DIR}/src/algorithms/extract_extending_graph.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/extract_connecting_graph.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/cover.hpp - ${CMAKE_SOURCE_DIR}/src/algorithms/path_sgd.hpp - ${CMAKE_SOURCE_DIR}/src/algorithms/path_sgd_layout.hpp + ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd.hpp + ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_layout.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/kmer.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/expand_context.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/id_ordered_paths.hpp From 9a9aa7b25fbbcb0e225e8ef0d645c87744299151 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Tue, 25 Oct 2022 13:02:31 +0200 Subject: [PATCH 04/29] refactory --- CMakeLists.txt | 2 + src/algorithms/pg_sgd/path_sgd_helper.cpp | 40 +++++++++++++++++++ src/algorithms/pg_sgd/path_sgd_helper.hpp | 28 +++++++++++++ src/subcommand/layout_main.cpp | 26 ++---------- src/subcommand/sort_main.cpp | 48 +++-------------------- 5 files changed, 80 insertions(+), 64 deletions(-) create mode 100644 src/algorithms/pg_sgd/path_sgd_helper.cpp create mode 100644 src/algorithms/pg_sgd/path_sgd_helper.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 38e1b5e5..288ce106 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -593,6 +593,7 @@ add_library(odgi_objs OBJECT ${CMAKE_SOURCE_DIR}/src/algorithms/cover.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_layout.cpp + ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_helper.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/draw.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/layout.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/atomic_image.cpp @@ -761,6 +762,7 @@ set(odgi_HEADERS ${CMAKE_SOURCE_DIR}/src/algorithms/cover.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_layout.hpp + ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_helper.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/kmer.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/expand_context.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/id_ordered_paths.hpp diff --git a/src/algorithms/pg_sgd/path_sgd_helper.cpp b/src/algorithms/pg_sgd/path_sgd_helper.cpp new file mode 100644 index 00000000..6b1750e8 --- /dev/null +++ b/src/algorithms/pg_sgd/path_sgd_helper.cpp @@ -0,0 +1,40 @@ +#include "path_sgd_helper.hpp" + +namespace odgi { + namespace algorithms { + + using namespace handlegraph; + + const uint64_t get_sum_path_step_count(const std::vector &path_sgd_use_paths, graph_t &graph) { + uint64_t sum_path_step_count = 0; + for (auto& path : path_sgd_use_paths) { + sum_path_step_count += graph.get_step_count(path); + } + return sum_path_step_count; + }; + + const uint64_t get_max_path_step_count(const std::vector &path_sgd_use_paths, graph_t &graph) { + uint64_t max_path_step_count = 0; + for (auto& path : path_sgd_use_paths) { + max_path_step_count = std::max(max_path_step_count, graph.get_step_count(path)); + } + return max_path_step_count; + } + + const uint64_t get_max_path_length_xp(const std::vector &path_sgd_use_paths, const xp::XP &path_index) { + uint64_t max_path_length = std::numeric_limits::min(); + for (auto &path : path_sgd_use_paths) { + max_path_length = std::max(max_path_length, path_index.get_path_length(path)); + } + return max_path_length; + } + + const uint64_t get_max_path_length_ssi(const std::vector &path_sgd_use_paths, const algorithms::step_index_t &sampled_step_index) { + uint64_t max_path_length = std::numeric_limits::min(); + for (auto &path : path_sgd_use_paths) { + max_path_length = std::max(max_path_length, sampled_step_index.get_path_len(path)); + } + return max_path_length; + } + } +} \ No newline at end of file diff --git a/src/algorithms/pg_sgd/path_sgd_helper.hpp b/src/algorithms/pg_sgd/path_sgd_helper.hpp new file mode 100644 index 00000000..52d45ef1 --- /dev/null +++ b/src/algorithms/pg_sgd/path_sgd_helper.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "odgi.hpp" +#include "xp.hpp" +#include "algorithms/stepindex.hpp" + +namespace odgi { + namespace algorithms { + + using namespace handlegraph; + + /// get the total number of steps in the graph + const uint64_t get_sum_path_step_count(const std::vector &path_sgd_use_paths, + graph_t &graph); + + /// get the maximum step count of all paths in the graph + const uint64_t get_max_path_step_count(const std::vector &path_sgd_use_paths, + graph_t &graph); + + /// get the maximum path length in nucleotides using the XP + const uint64_t get_max_path_length_xp(const std::vector &path_sgd_use_paths, + const xp::XP &path_index); + + /// get the maximum path length in nucleotides using the SSI + const uint64_t get_max_path_length_ssi(const std::vector &path_sgd_use_paths, + const algorithms::step_index_t &sampled_step_index); + } +} \ No newline at end of file diff --git a/src/subcommand/layout_main.cpp b/src/subcommand/layout_main.cpp index c3d8b13f..907873cb 100644 --- a/src/subcommand/layout_main.cpp +++ b/src/subcommand/layout_main.cpp @@ -5,6 +5,7 @@ #include #include "algorithms/xp.hpp" #include "algorithms/pg_sgd/path_sgd_layout.hpp" +#include "algorithms/pg_sgd/path_sgd_helper.hpp" #include "algorithms/draw.hpp" #include "algorithms/layout.hpp" #include "hilbert.hpp" @@ -147,26 +148,7 @@ int main_layout(int argc, char **argv) { const double eps = !p_sgd_eps ? 0.01 : args::get(p_sgd_eps); const double sgd_delta = p_sgd_delta ? args::get(p_sgd_delta) : 0; const bool show_progress = progress ? args::get(progress) : false; - /// path guided linear 2D SGD sort helpers - // TODO beautify this, maybe put into its own file - std::function &, - const xp::XP &)> get_sum_path_step_count - = [&](const std::vector &path_sgd_use_paths, const xp::XP &path_index) { - uint64_t sum_path_step_count = 0; - for (auto& path : path_sgd_use_paths) { - sum_path_step_count += path_index.get_path_step_count(path); - } - return sum_path_step_count; - }; - std::function &, - const xp::XP &)> get_max_path_step_count - = [&](const std::vector &path_sgd_use_paths, const xp::XP &path_index) { - uint64_t max_path_step_count = 0; - for (auto& path : path_sgd_use_paths) { - max_path_step_count = std::max(max_path_step_count, path_index.get_path_step_count(path)); - } - return max_path_step_count; - }; + // default parameters /* We don't do this, yet. std::string path_sgd_seed; @@ -242,7 +224,7 @@ int main_layout(int argc, char **argv) { } - uint64_t sum_path_step_count = get_sum_path_step_count(path_sgd_use_paths, path_index); + uint64_t sum_path_step_count = algorithms::get_sum_path_step_count(path_sgd_use_paths, graph); if (args::get(p_sgd_min_term_updates_paths)) { path_sgd_min_term_updates = args::get(p_sgd_min_term_updates_paths) * sum_path_step_count; } else { @@ -252,7 +234,7 @@ int main_layout(int argc, char **argv) { path_sgd_min_term_updates = 10.0 * sum_path_step_count; } } - uint64_t max_path_step_count = get_max_path_step_count(path_sgd_use_paths, path_index); + uint64_t max_path_step_count = algorithms::get_max_path_step_count(path_sgd_use_paths, graph); path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? std::min(args::get(p_sgd_zipf_space), max_path_step_count) : max_path_step_count; double path_sgd_max_eta = args::get(p_sgd_eta_max) ? args::get(p_sgd_eta_max) : (double) max_path_step_count * max_path_step_count; diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index 93c5f922..c8b237e8 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -13,6 +13,7 @@ #include "algorithms/linear_sgd.hpp" #include "algorithms/xp.hpp" #include "algorithms/pg_sgd/path_sgd.hpp" +#include "algorithms/pg_sgd/path_sgd_helper.hpp" #include "algorithms/groom.hpp" #include "algorithms/stepindex.hpp" @@ -177,45 +178,7 @@ int main_sort(int argc, char** argv) { graph.set_number_of_threads(num_threads); - /// path guided linear 1D SGD sort helpers - // TODO beautify this, maybe put into its own file - std::function &, - graph_t &)> get_sum_path_step_count - = [&](const std::vector &path_sgd_use_paths, graph_t &graph) { - uint64_t sum_path_step_count = 0; - for (auto& path : path_sgd_use_paths) { - sum_path_step_count += graph.get_step_count(path); - } - return sum_path_step_count; - }; - std::function &, - graph_t &)> get_max_path_step_count - = [&](const std::vector &path_sgd_use_paths, graph_t &graph) { - uint64_t max_path_step_count = 0; - for (auto& path : path_sgd_use_paths) { - max_path_step_count = std::max(max_path_step_count, graph.get_step_count(path)); - } - return max_path_step_count; - }; - std::function &, - const xp::XP &)> get_max_path_length_xp - = [&](const std::vector &path_sgd_use_paths, const xp::XP &path_index) { - uint64_t max_path_length = std::numeric_limits::min(); - for (auto &path : path_sgd_use_paths) { - max_path_length = std::max(max_path_length, path_index.get_path_length(path)); - } - return max_path_length; - }; - - std::function &, - const algorithms::step_index_t &)> get_max_path_length_ssi - = [&](const std::vector &path_sgd_use_paths, const algorithms::step_index_t &sampled_step_index) { - uint64_t max_path_length = std::numeric_limits::min(); - for (auto &path : path_sgd_use_paths) { - max_path_length = std::max(max_path_length, sampled_step_index.get_path_len(path)); - } - return max_path_length; - }; + // FIXME can we also do this for layout_main.cpp? // default parameters std::string path_sgd_seed; @@ -386,7 +349,7 @@ int main_sort(int argc, char** argv) { path_sgd_use_paths.push_back(path); }); } - uint64_t sum_path_step_count = get_sum_path_step_count(path_sgd_use_paths, graph); + uint64_t sum_path_step_count = algorithms::get_sum_path_step_count(path_sgd_use_paths, graph); if (args::get(p_sgd_min_term_updates_paths)) { path_sgd_min_term_updates = args::get(p_sgd_min_term_updates_paths) * sum_path_step_count; } else { @@ -396,8 +359,9 @@ int main_sort(int argc, char** argv) { path_sgd_min_term_updates = 1.0 * sum_path_step_count; } } - uint64_t max_path_step_count = get_max_path_step_count(path_sgd_use_paths, graph); - path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? args::get(p_sgd_zipf_space) : get_max_path_length_xp(path_sgd_use_paths, path_index); + uint64_t max_path_step_count = algorithms::get_max_path_step_count(path_sgd_use_paths, graph); + // TODO we might have to use the SSI here + path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? args::get(p_sgd_zipf_space) : algorithms::get_max_path_length_xp(path_sgd_use_paths, path_index); path_sgd_zipf_space_max = args::get(p_sgd_zipf_space_max) ? args::get(p_sgd_zipf_space_max) : 100; path_sgd_zipf_max_number_of_distributions = args::get(p_sgd_zipf_max_number_of_distributions) ? std::max( From b763c1064d6bb597d2b80440d2c3345a14e06726 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Tue, 25 Oct 2022 13:03:29 +0200 Subject: [PATCH 05/29] missing EOL --- src/algorithms/pg_sgd/path_sgd_helper.cpp | 2 +- src/algorithms/pg_sgd/path_sgd_helper.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_helper.cpp b/src/algorithms/pg_sgd/path_sgd_helper.cpp index 6b1750e8..6848ee12 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.cpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.cpp @@ -37,4 +37,4 @@ namespace odgi { return max_path_length; } } -} \ No newline at end of file +} diff --git a/src/algorithms/pg_sgd/path_sgd_helper.hpp b/src/algorithms/pg_sgd/path_sgd_helper.hpp index 52d45ef1..48812a06 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.hpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.hpp @@ -25,4 +25,4 @@ namespace odgi { const uint64_t get_max_path_length_ssi(const std::vector &path_sgd_use_paths, const algorithms::step_index_t &sampled_step_index); } -} \ No newline at end of file +} From 46597994baf240302a06bd7aa5655bfa80542ff1 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Tue, 25 Oct 2022 13:20:24 +0200 Subject: [PATCH 06/29] fix the FIXMEs is the plan --- src/subcommand/sort_main.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index c8b237e8..d4c8e33f 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -10,7 +10,6 @@ #include "algorithms/dagify_sort.hpp" #include "algorithms/random_order.hpp" //#include "algorithms/mondriaan_sort.hpp" -#include "algorithms/linear_sgd.hpp" #include "algorithms/xp.hpp" #include "algorithms/pg_sgd/path_sgd.hpp" #include "algorithms/pg_sgd/path_sgd_helper.hpp" @@ -178,8 +177,6 @@ int main_sort(int argc, char** argv) { graph.set_number_of_threads(num_threads); - // FIXME can we also do this for layout_main.cpp? - // default parameters std::string path_sgd_seed; if (p_sgd_seed) { @@ -198,6 +195,7 @@ int main_sort(int argc, char** argv) { } // even if no XP is given, it is likely we will build one automatically + // FIXME SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow if (tmp_base) { xp::temp_file::set_dir(args::get(tmp_base)); } else { @@ -211,7 +209,7 @@ int main_sort(int argc, char** argv) { graph.optimize(); } - // TODO We have this function here, in untangle, maybe somewhere else? Refactor! + // FIXME We have this function here and in flip_main.cpp, groom_main.cpp, tips_main.cpp, and untangle_main.cpp // path loading auto load_paths = [&](const std::string& path_names_file) { std::ifstream path_names_in(path_names_file); @@ -247,6 +245,7 @@ int main_sort(int argc, char** argv) { return paths; }; + // FIXME put this into the helper auto sort_graph_by_target_paths = [&](graph_t& graph, std::vector target_paths, std::vector& is_ref) { std::vector target_order; std::fill_n(std::back_inserter(is_ref), graph.get_node_count(), false); @@ -304,8 +303,7 @@ int main_sort(int argc, char** argv) { uint64_t path_sgd_zipf_space, path_sgd_zipf_space_max, path_sgd_zipf_space_quantization_step, path_sgd_zipf_max_number_of_distributions; std::vector path_sgd_use_paths; xp::XP path_index; - // TODO add SSI here - // const algorithms::step_index_t &sampled_step_index + algorithms::step_index_t sampled_step_index; bool fresh_step_index = false; std::string snapshot_prefix; if (snapshot) { @@ -318,7 +316,7 @@ int main_sort(int argc, char** argv) { target_paths = load_paths(args::get(_p_sgd_target_paths)); sort_graph_by_target_paths(graph, target_paths, is_ref); } - // TODO take care of path index or sampled step index + // FIXME SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow if (xp_in_file) { std::ifstream in; in.open(args::get(xp_in_file)); @@ -360,7 +358,7 @@ int main_sort(int argc, char** argv) { } } uint64_t max_path_step_count = algorithms::get_max_path_step_count(path_sgd_use_paths, graph); - // TODO we might have to use the SSI here + // FIXME we might have to use the SSI here path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? args::get(p_sgd_zipf_space) : algorithms::get_max_path_length_xp(path_sgd_use_paths, path_index); path_sgd_zipf_space_max = args::get(p_sgd_zipf_space_max) ? args::get(p_sgd_zipf_space_max) : 100; @@ -425,7 +423,7 @@ int main_sort(int argc, char** argv) { break; case 'Y': { if (!fresh_step_index) { - // TODO what about the SSI? + // FIXME SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow path_index.clean(); // do we have to sort by reference nodes first? if (_p_sgd_target_paths) { @@ -497,7 +495,8 @@ int main_sort(int argc, char** argv) { } else if (args::get(no_seeds)) { graph.apply_ordering(algorithms::topological_order(&graph, false, false, args::get(progress)), true); } else if (args::get(p_sgd)) { - std::vector order = + // FIXME SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow + std::vector order = algorithms::path_linear_sgd_order(graph, path_index, path_sgd_use_paths, From 4b5f889338e3e510ad7eb6ef7983fa9a5a45ce88 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Tue, 25 Oct 2022 14:52:44 +0200 Subject: [PATCH 07/29] re-volt --- src/algorithms/pg_sgd/path_sgd_helper.cpp | 43 ++++++++++++ src/algorithms/pg_sgd/path_sgd_helper.hpp | 4 ++ src/subcommand/flip_main.cpp | 37 +--------- src/subcommand/groom_main.cpp | 38 +--------- src/subcommand/sort_main.cpp | 85 ++--------------------- src/subcommand/tips_main.cpp | 41 +---------- src/subcommand/untangle_main.cpp | 39 +---------- src/utils.cpp | 36 ++++++++++ src/utils.hpp | 3 + 9 files changed, 96 insertions(+), 230 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_helper.cpp b/src/algorithms/pg_sgd/path_sgd_helper.cpp index 6848ee12..bb597308 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.cpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.cpp @@ -36,5 +36,48 @@ namespace odgi { } return max_path_length; } + + const void sort_graph_by_target_paths(graph_t& graph, const std::vector &target_paths, + std::vector& is_ref, + const bool &progress) { + std::vector target_order; + std::fill_n(std::back_inserter(is_ref), graph.get_node_count(), false); + std::unique_ptr target_paths_progress; + if (progress) { + std::string banner = "[odgi::sort] preparing target path vectors:"; + target_paths_progress = std::make_unique(target_paths.size(), banner); + } + for (handlegraph::path_handle_t target_path: target_paths) { + graph.for_each_step_in_path( + target_path, + [&](const step_handle_t &step) { + handle_t handle = graph.get_handle_of_step(step); + uint64_t i = graph.get_id(handle) - 1; + if (!is_ref[i]) { + is_ref[i] = true; + target_order.push_back(handle); + } + }); + if (progress) { + target_paths_progress->increment(1); + } + } + if (progress) { + target_paths_progress->finish(); + } + uint64_t ref_nodes = 0; + for (uint64_t i = 0; i < is_ref.size(); i++) { + bool ref = is_ref[i]; + if (!ref) { + target_order.push_back(graph.get_handle(i + 1)); + ref_nodes++; + } + } + graph.apply_ordering(target_order, true); + + // refill is_ref with start->ref_nodes: 1 and ref_nodes->end: 0 + std::fill_n(is_ref.begin(), ref_nodes, true); + std::fill(is_ref.begin() + ref_nodes, is_ref.end(), false); + } } } diff --git a/src/algorithms/pg_sgd/path_sgd_helper.hpp b/src/algorithms/pg_sgd/path_sgd_helper.hpp index 48812a06..fb5c8d26 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.hpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.hpp @@ -24,5 +24,9 @@ namespace odgi { /// get the maximum path length in nucleotides using the SSI const uint64_t get_max_path_length_ssi(const std::vector &path_sgd_use_paths, const algorithms::step_index_t &sampled_step_index); + + const void sort_graph_by_target_paths(graph_t& graph, const std::vector &target_paths, + std::vector& is_ref, + const bool &progress); } } diff --git a/src/subcommand/flip_main.cpp b/src/subcommand/flip_main.cpp index 86c3f0cd..fd165012 100644 --- a/src/subcommand/flip_main.cpp +++ b/src/subcommand/flip_main.cpp @@ -79,44 +79,9 @@ int main_flip(int argc, char **argv) { graph.set_number_of_threads(num_threads); - // path loading - auto load_paths = [&](const std::string& path_names_file) { - std::ifstream path_names_in(path_names_file); - uint64_t num_of_paths_in_file = 0; - std::vector path_already_seen; - path_already_seen.resize(graph.get_path_count(), false); - std::string line; - std::vector paths; - while (std::getline(path_names_in, line)) { - if (!line.empty()) { - if (graph.has_path(line)) { - const path_handle_t path = graph.get_path_handle(line); - const uint64_t path_rank = as_integer(path) - 1; - if (!path_already_seen[path_rank]) { - path_already_seen[path_rank] = true; - paths.push_back(path); - } else { - std::cerr << "[odgi::untangle] error: in the path list there are duplicated path names." - << std::endl; - exit(1); - } - } - ++num_of_paths_in_file; - } - } - path_names_in.close(); - std::cerr << "[odgi::untangle] found " << paths.size() << "/" << num_of_paths_in_file - << " paths to consider." << std::endl; - if (paths.empty()) { - std::cerr << "[odgi::untangle] error: no path to consider." << std::endl; - exit(1); - } - return paths; - }; - std::vector no_flips; if (_no_flips) { - no_flips = load_paths(args::get(_no_flips)); + no_flips = utils::load_paths(args::get(_no_flips), graph, "flip"); } graph_t into; diff --git a/src/subcommand/groom_main.cpp b/src/subcommand/groom_main.cpp index 678020e9..88e0a612 100644 --- a/src/subcommand/groom_main.cpp +++ b/src/subcommand/groom_main.cpp @@ -73,45 +73,9 @@ int main_groom(int argc, char** argv) { } } - // TODO We have this function here, in untangle, maybe somewhere else? Refactor! - // path loading - auto load_paths = [&](const std::string& path_names_file) { - std::ifstream path_names_in(path_names_file); - uint64_t num_of_paths_in_file = 0; - std::vector path_already_seen; - path_already_seen.resize(graph.get_path_count(), false); - std::string line; - std::vector paths; - while (std::getline(path_names_in, line)) { - if (!line.empty()) { - if (graph.has_path(line)) { - const path_handle_t path = graph.get_path_handle(line); - const uint64_t path_rank = as_integer(path) - 1; - if (!path_already_seen[path_rank]) { - path_already_seen[path_rank] = true; - paths.push_back(path); - } else { - std::cerr << "[odgi::groom] error: in the path list there are duplicated path names." - << std::endl; - exit(1); - } - } - ++num_of_paths_in_file; - } - } - path_names_in.close(); - std::cerr << "[odgi::groom] found " << paths.size() << "/" << num_of_paths_in_file - << " paths to consider." << std::endl; - if (paths.empty()) { - std::cerr << "[odgi::groom] error: no path to consider." << std::endl; - exit(1); - } - return paths; - }; - std::vector target_paths; if (_target_paths) { - target_paths = load_paths(args::get(_target_paths)); + target_paths = utils::load_paths(args::get(_target_paths), graph, "groom"); } graph.apply_ordering(algorithms::groom(graph, progress, target_paths, !args::get(use_dfs))); diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index d4c8e33f..8938be4b 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -15,6 +15,7 @@ #include "algorithms/pg_sgd/path_sgd_helper.hpp" #include "algorithms/groom.hpp" #include "algorithms/stepindex.hpp" +#include "utils.hpp" namespace odgi { @@ -209,84 +210,6 @@ int main_sort(int argc, char** argv) { graph.optimize(); } - // FIXME We have this function here and in flip_main.cpp, groom_main.cpp, tips_main.cpp, and untangle_main.cpp - // path loading - auto load_paths = [&](const std::string& path_names_file) { - std::ifstream path_names_in(path_names_file); - uint64_t num_of_paths_in_file = 0; - std::vector path_already_seen; - path_already_seen.resize(graph.get_path_count(), false); - std::string line; - std::vector paths; - while (std::getline(path_names_in, line)) { - if (!line.empty()) { - if (graph.has_path(line)) { - const path_handle_t path = graph.get_path_handle(line); - const uint64_t path_rank = as_integer(path) - 1; - if (!path_already_seen[path_rank]) { - path_already_seen[path_rank] = true; - paths.push_back(path); - } else { - std::cerr << "[odgi::sort] error: in the path list there are duplicated path names." - << std::endl; - exit(1); - } - } - ++num_of_paths_in_file; - } - } - path_names_in.close(); - std::cerr << "[odgi::sort] found " << paths.size() << "/" << num_of_paths_in_file - << " paths to consider." << std::endl; - if (paths.empty()) { - std::cerr << "[odgi::sort] error: no path to consider." << std::endl; - exit(1); - } - return paths; - }; - - // FIXME put this into the helper - auto sort_graph_by_target_paths = [&](graph_t& graph, std::vector target_paths, std::vector& is_ref) { - std::vector target_order; - std::fill_n(std::back_inserter(is_ref), graph.get_node_count(), false); - std::unique_ptr target_paths_progress; - if (args::get(progress)) { - std::string banner = "[odgi::sort] preparing target path vectors:"; - target_paths_progress = std::make_unique(target_paths.size(), banner); - } - for (handlegraph::path_handle_t target_path: target_paths) { - graph.for_each_step_in_path( - target_path, - [&](const step_handle_t &step) { - handle_t handle = graph.get_handle_of_step(step); - uint64_t i = graph.get_id(handle) - 1; - if (!is_ref[i]) { - is_ref[i] = true; - target_order.push_back(handle); - } - }); - if (args::get(progress)) { - target_paths_progress->increment(1); - } - } - if (args::get(progress)) { - target_paths_progress->finish(); - } - uint64_t ref_nodes = 0; - for (uint64_t i = 0; i < is_ref.size(); i++) { - bool ref = is_ref[i]; - if (!ref) { - target_order.push_back(graph.get_handle(i + 1)); - ref_nodes++; - } - } - graph.apply_ordering(target_order, true); - - // refill is_ref with start->ref_nodes: 1 and ref_nodes->end: 0 - std::fill_n(is_ref.begin(), ref_nodes, true); - std::fill(is_ref.begin() + ref_nodes, is_ref.end(), false); - }; - uint64_t path_sgd_iter_max = args::get(p_sgd_iter_max) ? args::get(p_sgd_iter_max) : 100; uint64_t path_sgd_iter_max_learning_rate = args::get(p_sgd_iter_with_max_learning_rate) ? args::get(p_sgd_iter_with_max_learning_rate) : 0; double path_sgd_zipf_theta = args::get(p_sgd_zipf_theta) ? args::get(p_sgd_zipf_theta) : 0.99; @@ -313,8 +236,8 @@ int main_sort(int argc, char** argv) { std::vector target_paths; if (p_sgd || args::get(pipeline).find('Y') != std::string::npos) { if (_p_sgd_target_paths) { - target_paths = load_paths(args::get(_p_sgd_target_paths)); - sort_graph_by_target_paths(graph, target_paths, is_ref); + target_paths = utils::load_paths(args::get(_p_sgd_target_paths), graph, "sort"); + algorithms::sort_graph_by_target_paths(graph, target_paths, is_ref, args::get(progress)); } // FIXME SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow if (xp_in_file) { @@ -427,7 +350,7 @@ int main_sort(int argc, char** argv) { path_index.clean(); // do we have to sort by reference nodes first? if (_p_sgd_target_paths) { - sort_graph_by_target_paths(graph, target_paths, is_ref); + algorithms::sort_graph_by_target_paths(graph, target_paths, is_ref, args::get(progress)); } path_index.from_handle_graph(graph, num_threads); } diff --git a/src/subcommand/tips_main.cpp b/src/subcommand/tips_main.cpp index 36c2f865..3ef8f293 100644 --- a/src/subcommand/tips_main.cpp +++ b/src/subcommand/tips_main.cpp @@ -88,43 +88,6 @@ namespace odgi { omp_set_num_threads(num_threads); - // path loading - auto load_paths = [&](const std::string& path_names_file) { - std::ifstream path_names_in(path_names_file); - uint64_t num_of_paths_in_file = 0; - std::vector path_already_seen; - path_already_seen.resize(graph.get_path_count(), false); - std::string line; - std::vector paths; - while (std::getline(path_names_in, line)) { - if (!line.empty()) { - if (graph.has_path(line)) { - const path_handle_t path = graph.get_path_handle(line); - const uint64_t path_rank = as_integer(path) - 1; - if (!path_already_seen[path_rank]) { - path_already_seen[path_rank] = true; - paths.push_back(path); - } else { - std::cerr << "[odgi::tips] error: in the path list there are duplicated path names." - << std::endl; - exit(1); - } - } - ++num_of_paths_in_file; - } - } - path_names_in.close(); - if (progress) { - std::cerr << "[odgi::tips] found " << paths.size() << "/" << num_of_paths_in_file - << " paths to consider." << std::endl; - } - if (paths.empty()) { - std::cerr << "[odgi::tips] error: no path to consider." << std::endl; - exit(1); - } - return paths; - }; - std::vector target_paths; std::vector query_paths; if (_target_path) { @@ -137,7 +100,7 @@ namespace odgi { exit(1); } } else if (_target_paths) { - target_paths = load_paths(args::get(_target_paths)); + target_paths = utils::load_paths(args::get(_target_paths), graph, "tips"); } else { target_paths.reserve(graph.get_path_count()); graph.for_each_path_handle([&](const path_handle_t path) { @@ -154,7 +117,7 @@ namespace odgi { exit(1); } } else if (_query_paths) { - query_paths = load_paths(args::get(_query_paths)); + query_paths = utils::load_paths(args::get(_query_paths), graph, "tips"); } else { query_paths.reserve(graph.get_path_count()); graph.for_each_path_handle([&](const path_handle_t path) { diff --git a/src/subcommand/untangle_main.cpp b/src/subcommand/untangle_main.cpp index 0f0b3477..cb30f0bc 100644 --- a/src/subcommand/untangle_main.cpp +++ b/src/subcommand/untangle_main.cpp @@ -112,41 +112,6 @@ int main_untangle(int argc, char **argv) { } } - // path loading - auto load_paths = [&](const std::string& path_names_file) { - std::ifstream path_names_in(path_names_file); - uint64_t num_of_paths_in_file = 0; - std::vector path_already_seen; - path_already_seen.resize(graph.get_path_count(), false); - std::string line; - std::vector paths; - while (std::getline(path_names_in, line)) { - if (!line.empty()) { - if (graph.has_path(line)) { - const path_handle_t path = graph.get_path_handle(line); - const uint64_t path_rank = as_integer(path) - 1; - if (!path_already_seen[path_rank]) { - path_already_seen[path_rank] = true; - paths.push_back(path); - } else { - std::cerr << "[odgi::untangle] error: in the path list there are duplicated path names." - << std::endl; - exit(1); - } - } - ++num_of_paths_in_file; - } - } - path_names_in.close(); - std::cerr << "[odgi::untangle] found " << paths.size() << "/" << num_of_paths_in_file - << " paths to consider." << std::endl; - if (paths.empty()) { - std::cerr << "[odgi::untangle] error: no path to consider." << std::endl; - exit(1); - } - return paths; - }; - std::vector target_paths; std::vector query_paths; if (_target_path) { @@ -159,7 +124,7 @@ int main_untangle(int argc, char **argv) { exit(1); } } else if (_target_paths) { - target_paths = load_paths(args::get(_target_paths)); + target_paths = utils::load_paths(args::get(_target_paths), graph, "untangle"); } else { target_paths.reserve(graph.get_path_count()); graph.for_each_path_handle([&](const path_handle_t path) { @@ -176,7 +141,7 @@ int main_untangle(int argc, char **argv) { exit(1); } } else if (_query_paths) { - query_paths = load_paths(args::get(_query_paths)); + query_paths = utils::load_paths(args::get(_query_paths), graph, "untangle"); } else { query_paths.reserve(graph.get_path_count()); graph.for_each_path_handle([&](const path_handle_t path) { diff --git a/src/utils.cpp b/src/utils.cpp index 0a39dc54..d756740b 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -85,4 +85,40 @@ namespace utils { uint64_t modulo(const uint64_t n, const uint64_t d) { return (n & (d - 1)); } + + std::vector load_paths(const std::string& path_names_file, + odgi::graph_t &graph, + const std::string subcommmand_name) { + std::ifstream path_names_in(path_names_file); + uint64_t num_of_paths_in_file = 0; + std::vector path_already_seen; + path_already_seen.resize(graph.get_path_count(), false); + std::string line; + std::vector paths; + while (std::getline(path_names_in, line)) { + if (!line.empty()) { + if (graph.has_path(line)) { + const path_handle_t path = graph.get_path_handle(line); + const uint64_t path_rank = as_integer(path) - 1; + if (!path_already_seen[path_rank]) { + path_already_seen[path_rank] = true; + paths.push_back(path); + } else { + std::cerr << "[odgi::" << subcommmand_name << "] error: in the path list there are duplicated path names." + << std::endl; + exit(1); + } + } + ++num_of_paths_in_file; + } + } + path_names_in.close(); + std::cerr << "[odgi::" << subcommmand_name << "] found " << paths.size() << "/" << num_of_paths_in_file + << " paths to consider." << std::endl; + if (paths.empty()) { + std::cerr << "[odgi::" << subcommmand_name << "] error: no path to consider." << std::endl; + exit(1); + } + return paths; + } } diff --git a/src/utils.hpp b/src/utils.hpp index 558b4a87..a6e1d750 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -18,6 +18,9 @@ namespace utils { /// this function will return n % d /// it is assumed that d is one of 1, 2, 4, 8, 16, 32, .... uint64_t modulo(const uint64_t n, const uint64_t d); + + // load path names from file, they are written line by line + std::vector load_paths(const std::string& path_names_file, odgi::graph_t &graph, const std::string subcommmand_name); } From 35f09b17ad7afdecb06cf438800a8d0c30445418 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Tue, 25 Oct 2022 16:59:23 +0200 Subject: [PATCH 08/29] SSI is the default route now --- src/algorithms/stepindex.cpp | 79 ++++++++++++++++++++++- src/algorithms/stepindex.hpp | 10 ++- src/subcommand/sort_main.cpp | 103 +++++++++++++++++++----------- src/subcommand/stepindex_main.cpp | 2 +- 4 files changed, 153 insertions(+), 41 deletions(-) diff --git a/src/algorithms/stepindex.cpp b/src/algorithms/stepindex.cpp index a246426c..9eedf9d9 100644 --- a/src/algorithms/stepindex.cpp +++ b/src/algorithms/stepindex.cpp @@ -8,8 +8,7 @@ step_index_t::step_index_t() { step_mphf = new boophf_step_t(); } - - +// FIXME replace this with "from_handle_graph" step_index_t::step_index_t(const PathHandleGraph& graph, const std::vector& paths, const uint64_t& nthreads, @@ -86,6 +85,82 @@ step_index_t::step_index_t(const PathHandleGraph& graph, } } +const void step_index_t::from_handle_graph(const PathHandleGraph& graph, + const std::vector& paths, + const uint64_t& nthreads, + const bool progress, + const uint64_t& sample_rate) { + this->sample_rate = sample_rate; + // iterate through the paths, recording steps in the structure we'll use to build the mphf + std::vector steps; + std::unique_ptr collecting_steps_progress_meter; + if (progress) { + collecting_steps_progress_meter = std::make_unique( + paths.size(), "[odgi::algorithms::stepindex] Collecting Steps Progress:"); + } + path_len.resize(paths.size()); +#pragma omp parallel for schedule(dynamic,1) + for (auto& path : paths) { + std::vector my_steps; + uint64_t path_length = 0; + graph.for_each_step_in_path( + path, [&](const step_handle_t& step) { + path_length += graph.get_length(graph.get_handle_of_step(step)); + // sampling + if (0 == utils::modulo(graph.get_id(graph.get_handle_of_step(step)), sample_rate)) { + my_steps.push_back(step); + } + }); + // sampling + if (0 == utils::modulo(graph.get_id(graph.get_handle_of_step(graph.path_end(path))), sample_rate)) { + my_steps.push_back(graph.path_end(path)); + } +#pragma omp critical (path_len) + path_len[as_integer(path) - 1] = path_length; +#pragma omp critical (steps_collect) + steps.insert(steps.end(), my_steps.begin(), my_steps.end()); + if(progress) { + collecting_steps_progress_meter->increment(1); + } + } + if (progress) { + collecting_steps_progress_meter->finish(); + } + // sort the steps + ips4o::parallel::sort(steps.begin(), steps.end(), std::less<>(), nthreads); + // build the hash function (quietly) + step_mphf = new boophf_step_t(steps.size(), steps, nthreads, 2.0, false, false); + // use the hash function to record the step positions + pos.resize(steps.size()); + std::unique_ptr building_progress_meter; + if (progress) { + building_progress_meter = std::make_unique( + paths.size(), "[odgi::algorithms::stepindex] Building Progress:"); + } +#pragma omp parallel for schedule(dynamic,1) + for (auto& path : paths) { + uint64_t offset = 0; + graph.for_each_step_in_path( + path, [&](const step_handle_t& step) { + // sampling + if (0 == utils::modulo(graph.get_id(graph.get_handle_of_step(step)), sample_rate)) { + pos[step_mphf->lookup(step)] = offset; + } + offset += graph.get_length(graph.get_handle_of_step(step)); + }); + // sampling + if (0 == utils::modulo(graph.get_id(graph.get_handle_of_step(graph.path_end(path))), sample_rate)) { + pos[step_mphf->lookup(graph.path_end(path))] = offset; + } + if (progress) { + building_progress_meter->increment(1); + } + } + if (progress) { + building_progress_meter->finish(); + } +} + const uint64_t step_index_t::get_position(const step_handle_t& step, const PathHandleGraph& graph) const { // is our step already in a node that we indexed? handle_t h = graph.get_handle_of_step(step); diff --git a/src/algorithms/stepindex.hpp b/src/algorithms/stepindex.hpp index 66684c9c..a963aed5 100644 --- a/src/algorithms/stepindex.hpp +++ b/src/algorithms/stepindex.hpp @@ -47,7 +47,8 @@ typedef boomphf::mphf> boophf_uin struct step_index_t { step_index_t(); - step_index_t(const PathHandleGraph& graph, + // FIXME replace this with "from_handle_graph" + step_index_t(const PathHandleGraph& graph, const std::vector& paths, const uint64_t& nthreads, const bool progress, @@ -59,6 +60,13 @@ struct step_index_t { step_index_t& operator=(const step_index_t& other) = delete; step_index_t& operator=(step_index_t&& other) = delete; + /// This feels like cheating the actual constructor, but maybe I should just remove it + const void from_handle_graph(const PathHandleGraph& graph, + const std::vector& paths, + const uint64_t& nthreads, + const bool progress, + const uint64_t& sample_rate); + const uint64_t get_position(const step_handle_t& step, const PathHandleGraph& graph) const; const uint64_t get_path_len(const path_handle_t& path) const; void save(const std::string& name) const; diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index 8938be4b..78b142d5 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -62,6 +62,7 @@ int main_sort(int argc, char** argv) { /// path guided linear 1D SGD args::Group pg_sgd_opts(parser, "[ Path Guided 1D SGD Sort ]"); args::Flag p_sgd(pg_sgd_opts, "path-sgd", "Apply the path-guided linear 1D SGD algorithm to organize graph.", {'Y', "path-sgd"}); + args::Flag p_sgd_xp(pg_sgd_opts, "path-sgd", "Use the faster but memory expensive XP index.", {'m', "path-sgd-path-index"}); args::ValueFlag p_sgd_in_file(pg_sgd_opts, "FILE", "Specify a line separated list of paths to sample from for the on the" " fly term generation process in the path guided linear 1D SGD (default: sample from all paths).", {'f', "path-sgd-use-paths"}); args::ValueFlag p_sgd_min_term_updates_paths(pg_sgd_opts, "N", "The minimum number of terms to be updated before a new path guided" @@ -152,9 +153,18 @@ int main_sort(int argc, char** argv) { return 1; } - if (xp_in_file && ssi_in_file) { + const bool xp_way = xp_in_file || p_sgd_xp; + + if (xp_way && ssi_in_file) { + std::cerr + << "[odgi::sort] error: please specify exclusively -X=[FILE]/--path-index=[FILE], -m/--path-sgd-path-index, or -e=[FILE]/--sampled-step-index=[FILE]." + << std::endl; + return 1; + } + + if (xp_in_file && p_sgd_xp) { std::cerr - << "[odgi::sort] error: the PG-SGD algorithm can only work with an XP (odgi pathindex) or with a sample step index (odgi stepindex). Please only enter one of those." + << "[odgi::sort] error: please specify -X=[FILE]/--path-index=[FILE] or -m=/--path-sgd-path-index, not both." << std::endl; return 1; } @@ -195,15 +205,18 @@ int main_sort(int argc, char** argv) { return 1; } - // even if no XP is given, it is likely we will build one automatically - // FIXME SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow - if (tmp_base) { - xp::temp_file::set_dir(args::get(tmp_base)); - } else { - char* cwd = get_current_dir_name(); - xp::temp_file::set_dir(std::string(cwd)); - free(cwd); - } + // SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow + // if an XP is given or we want to work with an XP based PG-SGD, we should set the temporary file + if (xp_way) { + std::cerr << "[odgi::sort] using XP index for the PG-SGD algorithm" << std::endl; + if (tmp_base) { + xp::temp_file::set_dir(args::get(tmp_base)); + } else { + char* cwd = get_current_dir_name(); + xp::temp_file::set_dir(std::string(cwd)); + free(cwd); + } + } // If required, first of all, optimize the graph so that it is optimized for subsequent algorithms (if required) if (args::get(optimize)) { @@ -232,44 +245,56 @@ int main_sort(int argc, char** argv) { if (snapshot) { snapshot_prefix = args::get(p_sgd_snapshot); } + // do we only want so sample from a subset of paths? + if (p_sgd_in_file) { + std::string buf; + std::ifstream use_paths(args::get(p_sgd_in_file).c_str()); + while (std::getline(use_paths, buf)) { + // check if the path is actually in the graph, else print an error and exit 1 + if (graph.has_path(buf)) { + path_sgd_use_paths.push_back(graph.get_path_handle(buf)); + } else { + std::cerr << "[odgi::sort] error: path '" << buf + << "' as was given by -f=[FILE], --path-sgd-use-paths=[FILE]" + " is not present in the graph. Please remove this path from the file and restart 'odgi sort'."; + } + } + use_paths.close(); + } else { + graph.for_each_path_handle( + [&](const path_handle_t &path) { + path_sgd_use_paths.push_back(path); + }); + } std::vector is_ref; std::vector target_paths; + uint64_t ssi_sample_rate = 8; if (p_sgd || args::get(pipeline).find('Y') != std::string::npos) { if (_p_sgd_target_paths) { target_paths = utils::load_paths(args::get(_p_sgd_target_paths), graph, "sort"); algorithms::sort_graph_by_target_paths(graph, target_paths, is_ref, args::get(progress)); } - // FIXME SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow + // SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow if (xp_in_file) { std::ifstream in; in.open(args::get(xp_in_file)); path_index.load(in); in.close(); - } else { + } else if (p_sgd_xp) { + // only if we set the flag! path_index.from_handle_graph(graph, num_threads); - } + // do we load the SSI from file? + } else if (ssi_in_file) { + sampled_step_index.load(args::get(ssi_in_file)); + // store the SSI sampling rate + ssi_sample_rate = sampled_step_index.get_sample_rate(); + // or do we build it from scratch? + } else { + // stepindex only for selected paths + sampled_step_index.from_handle_graph(graph, path_sgd_use_paths, num_threads, args::get(progress), ssi_sample_rate); + } fresh_step_index = true; - // do we only want so sample from a subset of paths? - if (p_sgd_in_file) { - std::string buf; - std::ifstream use_paths(args::get(p_sgd_in_file).c_str()); - while (std::getline(use_paths, buf)) { - // check if the path is actually in the graph, else print an error and exit 1 - if (graph.has_path(buf)) { - path_sgd_use_paths.push_back(graph.get_path_handle(buf)); - } else { - std::cerr << "[odgi::sort] error: path '" << buf - << "' as was given by -f=[FILE], --path-sgd-use-paths=[FILE]" - " is not present in the graph. Please remove this path from the file and restart 'odgi sort'."; - } - } - use_paths.close(); - } else { - graph.for_each_path_handle( - [&](const path_handle_t &path) { - path_sgd_use_paths.push_back(path); - }); - } + uint64_t sum_path_step_count = algorithms::get_sum_path_step_count(path_sgd_use_paths, graph); if (args::get(p_sgd_min_term_updates_paths)) { path_sgd_min_term_updates = args::get(p_sgd_min_term_updates_paths) * sum_path_step_count; @@ -281,8 +306,12 @@ int main_sort(int argc, char** argv) { } } uint64_t max_path_step_count = algorithms::get_max_path_step_count(path_sgd_use_paths, graph); - // FIXME we might have to use the SSI here - path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? args::get(p_sgd_zipf_space) : algorithms::get_max_path_length_xp(path_sgd_use_paths, path_index); + // we might have to use the SSI here + if (xp_way) { + path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? args::get(p_sgd_zipf_space) : algorithms::get_max_path_length_xp(path_sgd_use_paths, path_index); + } else { + path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? args::get(p_sgd_zipf_space) : algorithms::get_max_path_length_ssi(path_sgd_use_paths, sampled_step_index); + } path_sgd_zipf_space_max = args::get(p_sgd_zipf_space_max) ? args::get(p_sgd_zipf_space_max) : 100; path_sgd_zipf_max_number_of_distributions = args::get(p_sgd_zipf_max_number_of_distributions) ? std::max( diff --git a/src/subcommand/stepindex_main.cpp b/src/subcommand/stepindex_main.cpp index 39bc96ed..8b1e22b6 100644 --- a/src/subcommand/stepindex_main.cpp +++ b/src/subcommand/stepindex_main.cpp @@ -62,7 +62,7 @@ namespace odgi { exit(1); } - std::string step_index_out_file = dg_out_file ? args::get(dg_out_file) : (args::get(og_file) + ".stpidx"); + std::string step_index_out_file = dg_out_file ? args::get(dg_out_file) : (args::get(og_file) + ".ssi"); const uint64_t num_threads = args::get(nthreads) ? args::get(nthreads) : 1; From 3575a2eb3aec42d72b5d2994917dab2935b69fdb Mon Sep 17 00:00:00 2001 From: subwaystation Date: Thu, 27 Oct 2022 11:08:59 +0200 Subject: [PATCH 09/29] we can clean the ssi --- src/algorithms/stepindex.cpp | 8 ++++++++ src/algorithms/stepindex.hpp | 1 + src/subcommand/sort_main.cpp | 15 ++++++++++++--- src/unittest/sort.cpp | 1 + src/unittest/stepindex.cpp | 14 ++++++++++++-- 5 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/algorithms/stepindex.cpp b/src/algorithms/stepindex.cpp index 9eedf9d9..80af23c5 100644 --- a/src/algorithms/stepindex.cpp +++ b/src/algorithms/stepindex.cpp @@ -295,6 +295,14 @@ step_index_t::~step_index_t(void) { delete step_mphf; } +void step_index_t::clean() { + delete this->step_mphf; + // this should free all allocated memory https://www.techiedelight.com/delete-vector-free-memory-cpp/ + sdsl::int_vector<64>().swap(this->pos); + sdsl::int_vector<64>().swap(this->path_len); + this->sample_rate = 0; +} + // path step index diff --git a/src/algorithms/stepindex.hpp b/src/algorithms/stepindex.hpp index a963aed5..72ec8fcf 100644 --- a/src/algorithms/stepindex.hpp +++ b/src/algorithms/stepindex.hpp @@ -72,6 +72,7 @@ struct step_index_t { void save(const std::string& name) const; void load(const std::string& name); const uint64_t get_sample_rate(); + void clean(); // map from step to position in its path boophf_step_t* step_mphf = nullptr; sdsl::int_vector<64> pos; diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index 78b142d5..b087e215 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -375,13 +375,20 @@ int main_sort(int argc, char** argv) { break; case 'Y': { if (!fresh_step_index) { - // FIXME SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow - path_index.clean(); + if (xp_way) { + path_index.clean(); + } else { + sampled_step_index.clean(); + } // do we have to sort by reference nodes first? if (_p_sgd_target_paths) { algorithms::sort_graph_by_target_paths(graph, target_paths, is_ref, args::get(progress)); } - path_index.from_handle_graph(graph, num_threads); + if (xp_way) { + path_index.from_handle_graph(graph, num_threads); + } else { + sampled_step_index.from_handle_graph(graph, path_sgd_use_paths, num_threads, args::get(progress), ssi_sample_rate); + } } order = algorithms::path_linear_sgd_order(graph, path_index, @@ -404,6 +411,7 @@ int main_sort(int argc, char** argv) { snapshot_prefix, _p_sgd_target_paths, is_ref); + // FIXME SSI HERE break; } case 'f': @@ -470,6 +478,7 @@ int main_sort(int argc, char** argv) { snapshot_prefix, _p_sgd_target_paths, is_ref); + // FIXME SSI HERE graph.apply_ordering(order, true); } else if (args::get(breadth_first)) { graph.apply_ordering(algorithms::breadth_first_topological_order(graph, bf_chunk_size), true); diff --git a/src/unittest/sort.cpp b/src/unittest/sort.cpp index 9cd2f071..c395116e 100644 --- a/src/unittest/sort.cpp +++ b/src/unittest/sort.cpp @@ -126,6 +126,7 @@ TEST_CASE("Sorting a graph with paths", "[sort]") { } } +// FIXME add tests using SSI sorting TEST_CASE("Sorting a graph with paths 1 node long", "[sort]") { graph_t graph; diff --git a/src/unittest/stepindex.cpp b/src/unittest/stepindex.cpp index 5f2bc180..991c0bf4 100644 --- a/src/unittest/stepindex.cpp +++ b/src/unittest/stepindex.cpp @@ -79,6 +79,8 @@ namespace odgi { SECTION("The index delivers the correct positions for a given step. Sample rate: 1.") { step_index_t step_index_1(graph, paths, 1, false, 1); + step_index_1.clean(); + step_index_1.from_handle_graph(graph, paths, 1, false, 1); REQUIRE(step_index_1.get_sample_rate() == 1); graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); @@ -189,6 +191,8 @@ namespace odgi { SECTION("The index delivers the correct positions for a given step. Sample rate: 2.") { step_index_t step_index_2(graph, paths, 1, false, 2); + step_index_2.clean(); + step_index_2.from_handle_graph(graph, paths, 1, false, 2); REQUIRE(step_index_2.get_sample_rate() == 2); graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); @@ -283,6 +287,8 @@ namespace odgi { SECTION("The index delivers the correct positions for a given step. Sample rate: 4.") { step_index_t step_index_4(graph, paths, 1, false, 4); + step_index_4.clean(); + step_index_4.from_handle_graph(graph, paths, 1, false, 4); REQUIRE(step_index_4.get_sample_rate() == 4); graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); @@ -377,6 +383,8 @@ namespace odgi { SECTION("The index delivers the correct positions for a given step. Sample rate: 8.") { step_index_t step_index_8(graph, paths, 1, false, 8); + step_index_8.clean(); + step_index_8.from_handle_graph(graph, paths, 1, false, 8); REQUIRE(step_index_8.get_sample_rate() == 8); graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); @@ -471,6 +479,8 @@ namespace odgi { SECTION("The index delivers the correct positions for a given step. Sample rate: 16.") { step_index_t step_index_16(graph, paths, 1, false, 16); + step_index_16.clean(); + step_index_16.from_handle_graph(graph, paths, 1, false, 16); REQUIRE(step_index_16.get_sample_rate() == 16); graph.for_each_path_handle([&](const path_handle_t path) { std::string cur_path = graph.get_path_name(path); @@ -567,10 +577,10 @@ namespace odgi { step_index_t step_index_to_save(graph, paths, 1, false, 8); // Write index to temporary file in preparation for the next tests. std::string basename = xp::temp_file::create(); - step_index_to_save.save(basename + "unittest.stpidx"); + step_index_to_save.save(basename + "unittest.ssi"); step_index_t step_index_loaded; - step_index_loaded.load(basename + "unittest.stpidx"); + step_index_loaded.load(basename + "unittest.ssi"); REQUIRE(step_index_to_save.get_sample_rate() == 8); REQUIRE(step_index_loaded.get_sample_rate() == 8); From 2b7896a7a7a261f595724f9c88eb4fe227adf1f4 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Thu, 27 Oct 2022 11:27:53 +0200 Subject: [PATCH 10/29] CLEANERS --- src/algorithms/stepindex.cpp | 71 +----------------------------------- src/algorithms/stepindex.hpp | 6 +-- 2 files changed, 4 insertions(+), 73 deletions(-) diff --git a/src/algorithms/stepindex.cpp b/src/algorithms/stepindex.cpp index 80af23c5..9dc94b2c 100644 --- a/src/algorithms/stepindex.cpp +++ b/src/algorithms/stepindex.cpp @@ -8,81 +8,12 @@ step_index_t::step_index_t() { step_mphf = new boophf_step_t(); } -// FIXME replace this with "from_handle_graph" step_index_t::step_index_t(const PathHandleGraph& graph, const std::vector& paths, const uint64_t& nthreads, const bool progress, const uint64_t& sample_rate) { - this->sample_rate = sample_rate; - // iterate through the paths, recording steps in the structure we'll use to build the mphf - std::vector steps; - std::unique_ptr collecting_steps_progress_meter; - if (progress) { - collecting_steps_progress_meter = std::make_unique( - paths.size(), "[odgi::algorithms::stepindex] Collecting Steps Progress:"); - } - path_len.resize(paths.size()); -#pragma omp parallel for schedule(dynamic,1) - for (auto& path : paths) { - std::vector my_steps; - uint64_t path_length = 0; - graph.for_each_step_in_path( - path, [&](const step_handle_t& step) { - path_length += graph.get_length(graph.get_handle_of_step(step)); - // sampling - if (0 == utils::modulo(graph.get_id(graph.get_handle_of_step(step)), sample_rate)) { - my_steps.push_back(step); - } - }); - // sampling - if (0 == utils::modulo(graph.get_id(graph.get_handle_of_step(graph.path_end(path))), sample_rate)) { - my_steps.push_back(graph.path_end(path)); - } -#pragma omp critical (path_len) - path_len[as_integer(path) - 1] = path_length; -#pragma omp critical (steps_collect) - steps.insert(steps.end(), my_steps.begin(), my_steps.end()); - if(progress) { - collecting_steps_progress_meter->increment(1); - } - } - if (progress) { - collecting_steps_progress_meter->finish(); - } - // sort the steps - ips4o::parallel::sort(steps.begin(), steps.end(), std::less<>(), nthreads); - // build the hash function (quietly) - step_mphf = new boophf_step_t(steps.size(), steps, nthreads, 2.0, false, false); - // use the hash function to record the step positions - pos.resize(steps.size()); - std::unique_ptr building_progress_meter; - if (progress) { - building_progress_meter = std::make_unique( - paths.size(), "[odgi::algorithms::stepindex] Building Progress:"); - } -#pragma omp parallel for schedule(dynamic,1) - for (auto& path : paths) { - uint64_t offset = 0; - graph.for_each_step_in_path( - path, [&](const step_handle_t& step) { - // sampling - if (0 == utils::modulo(graph.get_id(graph.get_handle_of_step(step)), sample_rate)) { - pos[step_mphf->lookup(step)] = offset; - } - offset += graph.get_length(graph.get_handle_of_step(step)); - }); - // sampling - if (0 == utils::modulo(graph.get_id(graph.get_handle_of_step(graph.path_end(path))), sample_rate)) { - pos[step_mphf->lookup(graph.path_end(path))] = offset; - } - if (progress) { - building_progress_meter->increment(1); - } - } - if (progress) { - building_progress_meter->finish(); - } + this->from_handle_graph(graph, paths, nthreads, progress, sample_rate); } const void step_index_t::from_handle_graph(const PathHandleGraph& graph, diff --git a/src/algorithms/stepindex.hpp b/src/algorithms/stepindex.hpp index 72ec8fcf..812aa1fe 100644 --- a/src/algorithms/stepindex.hpp +++ b/src/algorithms/stepindex.hpp @@ -47,20 +47,20 @@ typedef boomphf::mphf> boophf_uin struct step_index_t { step_index_t(); - // FIXME replace this with "from_handle_graph" + /// Delete this in favour of this->from_handle_graph? For now, we have more interfaces. step_index_t(const PathHandleGraph& graph, const std::vector& paths, const uint64_t& nthreads, const bool progress, const uint64_t& sample_rate); ~step_index_t(void); - // We cannot move, assign, or copy until we add code to point SDSL supports at the new addresses for their vectors. + /// We cannot move, assign, or copy until we add code to point SDSL supports at the new addresses for their vectors. Taken from the XP index implementation. step_index_t(const step_index_t& other) = delete; step_index_t(step_index_t&& other) = delete; step_index_t& operator=(const step_index_t& other) = delete; step_index_t& operator=(step_index_t&& other) = delete; - /// This feels like cheating the actual constructor, but maybe I should just remove it + /// This feels like cheating the actual constructor, but how else could I invoke it again? const void from_handle_graph(const PathHandleGraph& graph, const std::vector& paths, const uint64_t& nthreads, From 3fbd88795c0d0b2d3828eec4f50d28789ce6e7c5 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Thu, 27 Oct 2022 14:24:28 +0200 Subject: [PATCH 11/29] ugly skeletor --- src/algorithms/pg_sgd/path_sgd.hpp | 7 +- src/algorithms/pg_sgd/path_sgd_helper.hpp | 6 + src/algorithms/pg_sgd/path_sgd_ssi.cpp | 688 ++++++++++++++++++++++ src/algorithms/pg_sgd/path_sgd_ssi.hpp | 86 +++ src/subcommand/sort_main.cpp | 137 +++-- 5 files changed, 873 insertions(+), 51 deletions(-) create mode 100644 src/algorithms/pg_sgd/path_sgd_ssi.cpp create mode 100644 src/algorithms/pg_sgd/path_sgd_ssi.hpp diff --git a/src/algorithms/pg_sgd/path_sgd.hpp b/src/algorithms/pg_sgd/path_sgd.hpp index 7dd43701..c8cbc25f 100644 --- a/src/algorithms/pg_sgd/path_sgd.hpp +++ b/src/algorithms/pg_sgd/path_sgd.hpp @@ -22,6 +22,7 @@ #include "XoshiroCpp.hpp" #include "../progress.hpp" #include "utils.hpp" +#include "path_sgd_helper.hpp" #include @@ -30,12 +31,6 @@ namespace algorithms { using namespace handlegraph; -struct handle_layout_t { - uint64_t weak_component = 0; - double pos = 0; - handle_t handle = as_handle(0); -}; - /// use SGD driven, by path guided, and partly zipfian distribution sampled pairwise distances to obtain a 1D linear layout of the graph that respects its topology std::vector path_linear_sgd(const graph_t &graph, const xp::XP &path_index, diff --git a/src/algorithms/pg_sgd/path_sgd_helper.hpp b/src/algorithms/pg_sgd/path_sgd_helper.hpp index fb5c8d26..5c76dcf2 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.hpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.hpp @@ -9,6 +9,12 @@ namespace odgi { using namespace handlegraph; + struct handle_layout_t { + uint64_t weak_component = 0; + double pos = 0; + handle_t handle = as_handle(0); + }; + /// get the total number of steps in the graph const uint64_t get_sum_path_step_count(const std::vector &path_sgd_use_paths, graph_t &graph); diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp new file mode 100644 index 00000000..7d6cd36a --- /dev/null +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -0,0 +1,688 @@ +#include "path_sgd_ssi.hpp" + +//#define debug_path_sgd +// #define eval_path_sgd +// #define debug_schedule +// # define debug_sample_from_nodes +namespace odgi { + namespace algorithms { + + std::vector path_linear_sgd_ssi(graph_t &graph, + const algorithms::step_index_t &sampled_step_index, + const std::vector &path_sgd_use_paths, + const uint64_t &iter_max, + const uint64_t &iter_with_max_learning_rate, + const uint64_t &min_term_updates, + const double &delta, + const double &eps, + const double &eta_max, + const double &theta, + const uint64_t &space, + const uint64_t &space_max, + const uint64_t &space_quantization_step, + const double &cooling_start, + const uint64_t &nthreads, + const bool &progress, + const bool &snapshot, + std::vector &snapshots, + const bool &target_sorting, + std::vector& target_nodes) { +#ifdef debug_path_sgd + std::cerr << "iter_max: " << iter_max << std::endl; + std::cerr << "min_term_updates: " << min_term_updates << std::endl; + std::cerr << "delta: " << delta << std::endl; + std::cerr << "eps: " << eps << std::endl; + std::cerr << "theta: " << theta << std::endl; + std::cerr << "space: " << space << std::endl; + std::cerr << "space_max: " << space_max << std::endl; + std::cerr << "space_quantization_step: " << space_quantization_step << std::endl; + std::cerr << "cooling_start: " << cooling_start << std::endl; +#endif + + uint64_t first_cooling_iteration = std::floor(cooling_start * (double)iter_max); + //std::cerr << "first cooling iteration " << first_cooling_iteration << std::endl; + + /// FIXME this is just so that it runs + xp::XP path_index; + path_index.from_handle_graph(graph, nthreads); + + uint64_t total_term_updates = iter_max * min_term_updates; + std::unique_ptr progress_meter; + if (progress) { + progress_meter = std::make_unique( + total_term_updates, "[odgi::path_linear_sgd] 1D path-guided SGD:"); + } + using namespace std::chrono_literals; // for timing stuff + uint64_t num_nodes = graph.get_node_count(); + // our positions in 1D + std::vector> X(num_nodes); + atomic snapshot_in_progress; + snapshot_in_progress.store(false); + std::vector> snapshot_progress(iter_max); + // we will produce one less snapshot compared to iterations + snapshot_progress[0].store(true); + // seed them with the graph order + uint64_t len = 0; + graph.for_each_handle( + [&X, &graph, &len](const handle_t &handle) { + // nb: we assume that the graph provides a compact handle set + X[number_bool_packing::unpack_number(handle)].store(len); + len += graph.get_length(handle); + }); + // the longest path length measured in nucleotides + //size_t longest_path_in_nucleotides = 0; + // the total path length in nucleotides + //size_t total_path_len_in_nucleotides = 0; + // here we store all path nucleotides lengths so we know later from which path we sampled our random position from + //IITree path_nucleotide_tree; + // iterate over all relevant path_handles: + // 1. build the interval tree + // 2. find out the longest path in nucleotides and store this number size_t + // 3. add the current path length to the total length + bool at_least_one_path_with_more_than_one_step = false; + + for (auto &path : path_sgd_use_paths) { +#ifdef debug_path_sgd + std::string path_name = graph.get_path_name(path); + std::cerr << path_name << std::endl; + std::cerr << as_integer(path) << std::endl; +#endif +#ifdef debug_path_sgd + size_t path_len = path_index.get_path_length(path); + std::cerr << path_name << " has length: " << path_len << std::endl; +#endif + //path_nucleotide_tree.add(total_path_len_in_nucleotides, total_path_len_in_nucleotides + path_len, path); + + //if (path_len > longest_path_in_nucleotides) { + // longest_path_in_nucleotides = path_len; + //} + //total_path_len_in_nucleotides += path_len; + + if (path_index.get_path_step_count(path) > 1){ + at_least_one_path_with_more_than_one_step = true; + break; + } + } + //path_nucleotide_tree.index(); + + if (at_least_one_path_with_more_than_one_step){ + double w_min = (double) 1.0 / (double) (eta_max); + +#ifdef debug_path_sgd + std::cerr << "w_min " << w_min << std::endl; +#endif + double w_max = 1.0; + // get our schedule + if (progress) { + std::cerr << "[odgi::path_linear_sgd] calculating linear SGD schedule (" << w_min << " " << w_max << " " + << iter_max << " " << iter_with_max_learning_rate << " " << eps << ")" << std::endl; + } + std::vector etas = path_linear_sgd_schedule_ssi(w_min, + w_max, + iter_max, + iter_with_max_learning_rate, + eps); + + // cache zipf zetas for our full path space (heavy, but one-off) + if (progress) { + std::cerr << "[odgi::path_linear_sgd] calculating zetas for " << (space <= space_max ? space : space_max + (space - space_max) / space_quantization_step + 1) << " zipf distributions" << std::endl; + } + + std::vector zetas((space <= space_max ? space : space_max + (space - space_max) / space_quantization_step + 1)+1); + uint64_t last_quantized_i = 0; + // FIXME add progress bar here? +#pragma omp parallel for schedule(static,1) + for (uint64_t i = 1; i < space+1; ++i) { + uint64_t quantized_i = i; + uint64_t compressed_space = i; + if (i > space_max){ + quantized_i = space_max + (i - space_max) / space_quantization_step + 1; + compressed_space = space_max + ((i - space_max) / space_quantization_step) * space_quantization_step; + } + + if (quantized_i != last_quantized_i){ + dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, compressed_space, theta); + zetas[quantized_i] = z_p.zeta(); + + last_quantized_i = quantized_i; + } + } + + // how many term updates we make + std::atomic term_updates; + term_updates.store(0); + // learning rate + std::atomic eta; + eta.store(etas.front()); + // adaptive zip theta + std::atomic adj_theta; + adj_theta.store(theta); + // if we're in a final cooling phase (last 10%) of iterations + std::atomic cooling; + cooling.store(false); + // our max delta + std::atomic Delta_max; + Delta_max.store(0); + // should we keep working? + std::atomic work_todo; + work_todo.store(true); + // approximately what iteration we're on + uint64_t iteration = 0; + // launch a thread to update the learning rate, count iterations, and decide when to stop + auto checker_lambda = + [&]() { + while (work_todo.load()) { + if (term_updates.load() > min_term_updates) { + if (snapshot) { + if (snapshot_progress[iteration].load() || iteration == iter_max) { + iteration++; + if (iteration == iter_max) { + snapshot_in_progress.store(false); + } else { + snapshot_in_progress.store(true); + } + } else { + snapshot_in_progress.store(true); + continue; + } + } else { + iteration++; + snapshot_in_progress.store(false); + } + if (iteration > iter_max) { + work_todo.store(false); + } else if (Delta_max.load() <= delta) { // nb: this will also break at 0 + if (progress) { + std::cerr << "[odgi::path_linear_sgd] delta_max: " << Delta_max.load() + << " <= delta: " + << delta << ". Threshold reached, therefore ending iterations." + << std::endl; + } + work_todo.store(false); + } else { + eta.store(etas[iteration]); // update our learning rate + Delta_max.store(delta); // set our delta max to the threshold + if (iteration > first_cooling_iteration) { + adj_theta.store(0.001); + cooling.store(true); + } + } + term_updates.store(0); + } + std::this_thread::sleep_for(1ms); + } + }; + + auto worker_lambda = + [&](uint64_t tid) { + // everyone tries to seed with their own random data + const std::uint64_t seed = 9399220 + tid; + XoshiroCpp::Xoshiro256Plus gen(seed); // a nice, fast PRNG + // FIXME these should be replaced by the bv and ssi + // some references to literal bitvectors in the path index hmmm + const sdsl::bit_vector &np_bv = path_index.get_np_bv(); + const sdsl::int_vector<> &nr_iv = path_index.get_nr_iv(); + const sdsl::int_vector<> &npi_iv = path_index.get_npi_iv(); + // we'll sample from all path steps + std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, np_bv.size() - 1); + std::uniform_int_distribution flip(0, 1); + uint64_t term_updates_local = 0; + while (work_todo.load()) { + if (!snapshot_in_progress.load()) { + // sample the first node from all the nodes in the graph + // pick a random position from all paths + uint64_t step_index = dis_step(gen); +#ifdef debug_sample_from_nodes + std::cerr << "step_index: " << step_index << std::endl; +#endif + // FIXME we should use a bv here with rank and select + uint64_t path_i = npi_iv[step_index]; + path_handle_t path = as_path_handle(path_i); + + // FIXME replace with ODGI + size_t path_step_count = path_index.get_path_step_count(path); + if (path_step_count == 1){ + continue; + } + +#ifdef debug_sample_from_nodes + std::cerr << "path integer: " << path_i << std::endl; +#endif + step_handle_t step_a, step_b; + as_integers(step_a)[0] = path_i; // path index + // FIXME use the bv here + size_t s_rank = nr_iv[step_index] - 1; // step rank in path + as_integers(step_a)[1] = s_rank; +#ifdef debug_sample_from_nodes + std::cerr << "step rank in path: " << nr_iv[step_index] << std::endl; +#endif + + if (cooling.load() || flip(gen)) { + auto _theta = adj_theta.load(); + if (s_rank > 0 && flip(gen) || s_rank == path_step_count-1) { + // go backward + uint64_t jump_space = std::min(space, s_rank); + uint64_t space = jump_space; + if (jump_space > space_max){ + space = space_max + (jump_space - space_max) / space_quantization_step + 1; + } + dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, jump_space, _theta, zetas[space]); + dirtyzipf::dirty_zipfian_int_distribution z(z_p); + uint64_t z_i = z(gen); + //assert(z_i <= path_space); + as_integers(step_b)[0] = as_integer(path); + as_integers(step_b)[1] = s_rank - z_i; + } else { + // go forward + uint64_t jump_space = std::min(space, path_step_count - s_rank - 1); + uint64_t space = jump_space; + if (jump_space > space_max){ + space = space_max + (jump_space - space_max) / space_quantization_step + 1; + } + dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, jump_space, _theta, zetas[space]); + dirtyzipf::dirty_zipfian_int_distribution z(z_p); + uint64_t z_i = z(gen); + //assert(z_i <= path_space); + as_integers(step_b)[0] = as_integer(path); + as_integers(step_b)[1] = s_rank + z_i; + } + } else { + // sample randomly across the path + graph.get_step_count(path); + std::uniform_int_distribution rando(0, graph.get_step_count(path)-1); + as_integers(step_b)[0] = as_integer(path); + as_integers(step_b)[1] = rando(gen); + } + + // and the graph handles, which we need to record the update + // FIXME use ODGI here + handle_t term_i = path_index.get_handle_of_step(step_a); + handle_t term_j = path_index.get_handle_of_step(step_b); + + bool update_term_i = true; + bool update_term_j = true; + + + // Check which terms we actually have to update + if (target_sorting) { + if (target_nodes[graph.get_id(term_i) - 1]) { + update_term_i = false; + } + if (target_nodes[graph.get_id(term_j) - 1]) { + update_term_j = false; + } + } + if (!update_term_j && !update_term_i) { + // we also have to update the number of terms here, because else we will over sample and the sorting will take much longer + term_updates_local++; + if (progress) { + progress_meter->increment(1); + } + continue; + } + + // adjust the positions to the node starts + /// FIXME this should be coming from the bv + size_t pos_in_path_a = path_index.get_position_of_step(step_a); + /// FIXME we use ODGI to run along until we reach the step with the rank of b + size_t pos_in_path_b = path_index.get_position_of_step(step_b); +#ifdef debug_path_sgd + std::cerr << "1. pos in path " << pos_in_path_a << " " << pos_in_path_b << std::endl; +#endif + // assert(pos_in_path_a < path_index.get_path_length(path)); + // assert(pos_in_path_b < path_index.get_path_length(path)); + +#ifdef debug_path_sgd + std::cerr << "2. pos in path " << pos_in_path_a << " " << pos_in_path_b << std::endl; +#endif + // establish the term distance + double term_dist = std::abs( + static_cast(pos_in_path_a) - static_cast(pos_in_path_b)); + + if (term_dist == 0) { + continue; + // term_dist = 1e-9; + } +#ifdef eval_path_sgd + /// FIXME use ODGI + std::string path_name = path_index.get_path_name(path); + std::cerr << path_name << "\t" << pos_in_path_a << "\t" << pos_in_path_b << "\t" << term_dist << std::endl; +#endif + // assert(term_dist == zipf_int); +#ifdef debug_path_sgd + std::cerr << "term_dist: " << term_dist << std::endl; +#endif + double term_weight = 1.0 / term_dist; + + double w_ij = term_weight; +#ifdef debug_path_sgd + std::cerr << "w_ij = " << w_ij << std::endl; +#endif + double mu = eta.load() * w_ij; + if (mu > 1) { + mu = 1; + } + // actual distance in graph + double d_ij = term_dist; + // identities + uint64_t i = number_bool_packing::unpack_number(term_i); + uint64_t j = number_bool_packing::unpack_number(term_j); +#ifdef debug_path_sgd + #pragma omp critical (cerr) + std::cerr << "nodes are " << graph.get_id(term_i) << " and " << graph.get_id(term_j) << std::endl; +#endif + // distance == magnitude in our 1D situation + double dx = X[i].load() - X[j].load(); + if (dx == 0) { + dx = 1e-9; // avoid nan + } +#ifdef debug_path_sgd + #pragma omp critical (cerr) + std::cerr << "distance is " << dx << " but should be " << d_ij << std::endl; +#endif + //double mag = dx; //sqrt(dx*dx + dy*dy); + double mag = std::abs(dx); +#ifdef debug_path_sgd + std::cerr << "mu " << mu << " mag " << mag << " d_ij " << d_ij << std::endl; +#endif + // check distances for early stopping + double Delta = mu * (mag - d_ij) / 2; + // try until we succeed. risky. + double Delta_abs = std::abs(Delta); +#ifdef debug_path_sgd + #pragma omp critical (cerr) + std::cerr << "Delta_abs " << Delta_abs << std::endl; +#endif + while (Delta_abs > Delta_max.load()) { + Delta_max.store(Delta_abs); + } + // calculate update + double r = Delta / mag; + double r_x = r * dx; +#ifdef debug_path_sgd + #pragma omp critical (cerr) + std::cerr << "r_x is " << r_x << std::endl; +#endif + // update our positions (atomically) +#ifdef debug_path_sgd + std::cerr << "before X[i] " << X[i].load() << " X[j] " << X[j].load() << std::endl; +#endif + if (update_term_i) { + X[i].store(X[i].load() - r_x); + } + if (update_term_j) { + X[j].store(X[j].load() + r_x); + } +#ifdef debug_path_sgd + std::cerr << "after X[i] " << X[i].load() << " X[j] " << X[j].load() << std::endl; +#endif + term_updates_local++; + if (term_updates_local >= 1000) { + term_updates += term_updates_local; + term_updates_local = 0; + } + if (progress) { + progress_meter->increment(1); + } + } + } + }; + + auto snapshot_lambda = + [&](void) { + uint64_t iter = 0; + while (snapshot && work_todo.load()) { + if ((iter < iteration) && iteration != iter_max) { + //snapshot_in_progress.store(true); // will be released again by the snapshot thread + std::cerr << "[odgi::path_linear_sgd] snapshot thread: Taking snapshot!" << std::endl; + // create temp file + std::string snapshot_tmp_file = xp::temp_file::create("snapshot"); + // write to temp file + ofstream snapshot_stream; + snapshot_stream.open(snapshot_tmp_file); + for (auto &x : X) { + snapshot_stream << x << std::endl; + } + // push back the name of the temp file + snapshots.push_back(snapshot_tmp_file); + iter = iteration; + // std::cerr << "ITER: " << iter << std::endl; + snapshot_in_progress.store(false); + snapshot_progress[iter].store(true); + } + std::this_thread::sleep_for(1ms); + } + + }; + + std::thread checker(checker_lambda); + std::thread snapshot_thread(snapshot_lambda); + + std::vector workers; + workers.reserve(nthreads); + for (uint64_t t = 0; t < nthreads; ++t) { + workers.emplace_back(worker_lambda, t); + } + + for (uint64_t t = 0; t < nthreads; ++t) { + workers[t].join(); + } + + snapshot_thread.join(); + + checker.join(); + } + + if (progress) { + progress_meter->finish(); + } + + // drop out of atomic stuff... maybe not the best way to do this + std::vector X_final(X.size()); + uint64_t i = 0; + for (auto &x : X) { + X_final[i++] = x.load(); + } + return X_final; + } + + std::vector path_linear_sgd_schedule_ssi(const double &w_min, + const double &w_max, + const uint64_t &iter_max, + const uint64_t &iter_with_max_learning_rate, + const double &eps) { +#ifdef debug_schedule + std::cerr << "w_min: " << w_min << std::endl; + std::cerr << "w_max: " << w_max << std::endl; + std::cerr << "iter_max: " << iter_max << std::endl; + std::cerr << "eps: " << eps << std::endl; +#endif + double eta_max = 1.0 / w_min; + double eta_min = eps / w_max; + double lambda = log(eta_max / eta_min) / ((double) iter_max - 1); +#ifdef debug_schedule + std::cerr << "eta_max: " << eta_max << std::endl; + std::cerr << "eta_min: " << eta_min << std::endl; + std::cerr << "lambda: " << lambda << std::endl; +#endif + // initialize step sizes + std::vector etas; + etas.reserve(iter_max + 1); +#ifdef debug_schedule + std::cerr << "etas: "; +#endif + for (int64_t t = 0; t <= iter_max; t++) { + etas.push_back(eta_max * exp(-lambda * (abs(t - (int64_t) iter_with_max_learning_rate)))); +#ifdef debug_schedule + std::cerr << etas.back() << ", "; +#endif + } +#ifdef debug_schedule + std::cerr << std::endl; +#endif + return etas; + } + + std::vector path_linear_sgd_order_ssi(graph_t &graph, + const algorithms::step_index_t &sampled_step_index, + const std::vector &path_sgd_use_paths, + const uint64_t &iter_max, + const uint64_t &iter_with_max_learning_rate, + const uint64_t &min_term_updates, + const double &delta, + const double &eps, + const double &eta_max, + const double &theta, + const uint64_t &space, + const uint64_t &space_max, + const uint64_t &space_quantization_step, + const double &cooling_start, + const uint64_t &nthreads, + const bool &progress, + const std::string &seed, + const bool &snapshot, + const std::string &snapshot_prefix, + const bool &target_sorting, + std::vector& target_nodes) { + std::vector snapshots; + std::vector layout = path_linear_sgd_ssi(graph, + sampled_step_index, + path_sgd_use_paths, + iter_max, + iter_with_max_learning_rate, + min_term_updates, + delta, + eps, + eta_max, + theta, + space, + space_max, + space_quantization_step, + cooling_start, + nthreads, + progress, + snapshot, + snapshots, + target_sorting, + target_nodes); + // TODO move the following into its own function that we can reuse +#ifdef debug_components + std::cerr << "node count: " << graph.get_node_count() << std::endl; +#endif + // refine order by weakly connected components + std::vector> weak_components = algorithms::weakly_connected_components( + &graph); +#ifdef debug_components + std::cerr << "components count: " << weak_components.size() << std::endl; +#endif + std::vector> weak_component_order; + for (int i = 0; i < weak_components.size(); i++) { + auto &weak_component = weak_components[i]; + uint64_t id_sum = 0; + for (auto node_id : weak_component) { + id_sum += node_id; + } + double avg_id = id_sum / (double) weak_component.size(); + weak_component_order.push_back(std::make_pair(avg_id, i)); + } + std::sort(weak_component_order.begin(), weak_component_order.end()); + std::vector weak_component_id; // maps rank to "id" based on the orignial sorted order + weak_component_id.resize(weak_component_order.size()); + uint64_t component_id = 0; + for (auto &component_order : weak_component_order) { + weak_component_id[component_order.second] = component_id++; + } + std::vector weak_components_map; + weak_components_map.resize(graph.get_node_count()); + // reserve the space we need + for (int i = 0; i < weak_components.size(); i++) { + auto &weak_component = weak_components[i]; + // store for each node identifier to component start index + for (auto node_id : weak_component) { + weak_components_map[node_id - 1] = weak_component_id[i]; + } +#ifdef debug_components + std::cerr << "weak_component.size(): " << weak_component.size() << std::endl; + std::cerr << "component_index: " << i << std::endl; +#endif + } + weak_components_map.clear(); + if (snapshot) { + for (int j = 0; j < snapshots.size(); j++) { + std::string snapshot_file_name = snapshots[j]; + std::ifstream snapshot_instream(snapshot_file_name); + std::vector snapshot_layout; + std::string line; + while(std::getline(snapshot_instream, line)) { + snapshot_layout.push_back(std::stod(line)); + } + snapshot_instream.close(); + uint64_t i = 0; + std::vector snapshot_handle_layout; + graph.for_each_handle( + [&i, &snapshot_layout, &weak_components_map, &snapshot_handle_layout]( + const handle_t &handle) { + snapshot_handle_layout.push_back( + { + weak_components_map[number_bool_packing::unpack_number(handle)], + snapshot_layout[i++], + handle + }); + }); + // sort the graph layout by component, then pos, then handle rank + std::sort(snapshot_handle_layout.begin(), snapshot_handle_layout.end(), + [&](const algorithms::handle_layout_t &a, + const algorithms::handle_layout_t &b) { + return a.weak_component < b.weak_component + || (a.weak_component == b.weak_component + && a.pos < b.pos + || (a.pos == b.pos + && as_integer(a.handle) < as_integer(b.handle))); + }); + std::vector order; + order.reserve(graph.get_node_count()); + for (auto &layout_handle : snapshot_handle_layout) { + order.push_back(layout_handle.handle); + } + std::cerr << "[odgi::path_linear_sgd] Applying order to graph of snapshot: " << std::to_string(j + 1) + << std::endl; + std::string local_snapshot_prefix = snapshot_prefix + std::to_string(j + 1); + auto* graph_copy = new odgi::graph_t(); + utils::graph_deep_copy(graph, graph_copy); + graph_copy->apply_ordering(order, true); + ofstream f(local_snapshot_prefix); + std::cerr << "[odgi::path_linear_sgd] Writing snapshot: " << std::to_string(j + 1) << std::endl; + graph_copy->serialize(f); + f.close(); + } + } + std::vector handle_layout; + uint64_t i = 0; + graph.for_each_handle( + [&i, &layout, &weak_components_map, &handle_layout](const handle_t &handle) { + handle_layout.push_back( + { + weak_components_map[number_bool_packing::unpack_number(handle)], + layout[i++], + handle + }); + }); + // sort the graph layout by component, then pos, then handle rank + std::sort(handle_layout.begin(), handle_layout.end(), + [&](const algorithms::handle_layout_t &a, + const algorithms::handle_layout_t &b) { + return a.weak_component < b.weak_component + || (a.weak_component == b.weak_component + && a.pos < b.pos + || (a.pos == b.pos + && as_integer(a.handle) < as_integer(b.handle))); + }); + std::vector order; + order.reserve(graph.get_node_count()); + for (auto &layout_handle : handle_layout) { + order.push_back(layout_handle.handle); + } + return order; + } + } +} diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.hpp b/src/algorithms/pg_sgd/path_sgd_ssi.hpp new file mode 100644 index 00000000..bc8ed2c4 --- /dev/null +++ b/src/algorithms/pg_sgd/path_sgd_ssi.hpp @@ -0,0 +1,86 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../stepindex.hpp" +#include "../xp.hpp" // we still ned the xp for the tmp_file, maybe move this to utils? +#include "../sgd_term.hpp" +#include "IITree.h" +#include +#include +#include "../weakly_connected_components.hpp" +#include +#include "dirty_zipfian_int_distribution.h" +#include "XoshiroCpp.hpp" +#include "../progress.hpp" +#include "utils.hpp" +#include "path_sgd_helper.hpp" + +#include + +namespace odgi { + namespace algorithms { + + using namespace handlegraph; + +/// use SGD driven, by path guided, and partly zipfian distribution sampled pairwise distances to obtain a 1D linear layout of the graph that respects its topology +// FIXME make graph const again + std::vector path_linear_sgd_ssi(graph_t &graph, + const algorithms::step_index_t &sampled_step_index, + const std::vector& path_sgd_use_paths, + const uint64_t &iter_max, + const uint64_t &iter_with_max_learning_rate, + const uint64_t &min_term_updates, + const double &delta, + const double &eps, + const double &eta_max, + const double &theta, + const uint64_t &space, + const uint64_t &space_max, + const uint64_t &space_quantization_step, + const double &cooling_start, + const uint64_t &nthreads, + const bool &progress, + const bool &snapshot, + std::vector &snapshots); + +/// our learning schedule + std::vector path_linear_sgd_schedule_ssi(const double &w_min, + const double &w_max, + const uint64_t &iter_max, + const uint64_t &iter_with_max_learning_rate, + const double &eps); +// FIMXME make grpah const again + std::vector path_linear_sgd_order_ssi(graph_t &graph, + const algorithms::step_index_t &sampled_step_index, + const std::vector& path_sgd_use_paths, + const uint64_t &iter_max, + const uint64_t &iter_with_max_learning_rate, + const uint64_t &min_term_updates, + const double &delta, + const double &eps, + const double &eta_max, + const double &theta, + const uint64_t &space, + const uint64_t &space_max, + const uint64_t &space_quantization_step, + const double &cooling_start, + const uint64_t &nthreads, + const bool &progress, + const std::string &seed, + const bool &snapshot, + const std::string &snapshot_prefix, + const bool &target_sorting, + std::vector& target_nodes); + + } + +} \ No newline at end of file diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index b087e215..2beac211 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -13,6 +13,7 @@ #include "algorithms/xp.hpp" #include "algorithms/pg_sgd/path_sgd.hpp" #include "algorithms/pg_sgd/path_sgd_helper.hpp" +#include "algorithms/pg_sgd/path_sgd_ssi.hpp" #include "algorithms/groom.hpp" #include "algorithms/stepindex.hpp" #include "utils.hpp" @@ -390,28 +391,51 @@ int main_sort(int argc, char** argv) { sampled_step_index.from_handle_graph(graph, path_sgd_use_paths, num_threads, args::get(progress), ssi_sample_rate); } } - order = algorithms::path_linear_sgd_order(graph, - path_index, - path_sgd_use_paths, - path_sgd_iter_max, - path_sgd_iter_max_learning_rate, - path_sgd_min_term_updates, - path_sgd_delta, - path_sgd_eps, - path_sgd_max_eta, - path_sgd_zipf_theta, - path_sgd_zipf_space, - path_sgd_zipf_space_max, - path_sgd_zipf_space_quantization_step, - path_sgd_cooling, - num_threads, - progress, - path_sgd_seed, - snapshot, - snapshot_prefix, - _p_sgd_target_paths, - is_ref); - // FIXME SSI HERE + if (xp_way) { + order = algorithms::path_linear_sgd_order(graph, + path_index, + path_sgd_use_paths, + path_sgd_iter_max, + path_sgd_iter_max_learning_rate, + path_sgd_min_term_updates, + path_sgd_delta, + path_sgd_eps, + path_sgd_max_eta, + path_sgd_zipf_theta, + path_sgd_zipf_space, + path_sgd_zipf_space_max, + path_sgd_zipf_space_quantization_step, + path_sgd_cooling, + num_threads, + progress, + path_sgd_seed, + snapshot, + snapshot_prefix, + _p_sgd_target_paths, + is_ref); + } else { + order = algorithms::path_linear_sgd_order_ssi(graph, + sampled_step_index, + path_sgd_use_paths, + path_sgd_iter_max, + path_sgd_iter_max_learning_rate, + path_sgd_min_term_updates, + path_sgd_delta, + path_sgd_eps, + path_sgd_max_eta, + path_sgd_zipf_theta, + path_sgd_zipf_space, + path_sgd_zipf_space_max, + path_sgd_zipf_space_quantization_step, + path_sgd_cooling, + num_threads, + progress, + path_sgd_seed, + snapshot, + snapshot_prefix, + _p_sgd_target_paths, + is_ref); + } break; } case 'f': @@ -456,29 +480,52 @@ int main_sort(int argc, char** argv) { graph.apply_ordering(algorithms::topological_order(&graph, false, false, args::get(progress)), true); } else if (args::get(p_sgd)) { // FIXME SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow - std::vector order = - algorithms::path_linear_sgd_order(graph, - path_index, - path_sgd_use_paths, - path_sgd_iter_max, - path_sgd_iter_max_learning_rate, - path_sgd_min_term_updates, - path_sgd_delta, - path_sgd_eps, - path_sgd_max_eta, - path_sgd_zipf_theta, - path_sgd_zipf_space, - path_sgd_zipf_space_max, - path_sgd_zipf_space_quantization_step, - path_sgd_cooling, - num_threads, - progress, - path_sgd_seed, - snapshot, - snapshot_prefix, - _p_sgd_target_paths, - is_ref); - // FIXME SSI HERE + std::vector order; + if (xp_way) { + order = algorithms::path_linear_sgd_order(graph, + path_index, + path_sgd_use_paths, + path_sgd_iter_max, + path_sgd_iter_max_learning_rate, + path_sgd_min_term_updates, + path_sgd_delta, + path_sgd_eps, + path_sgd_max_eta, + path_sgd_zipf_theta, + path_sgd_zipf_space, + path_sgd_zipf_space_max, + path_sgd_zipf_space_quantization_step, + path_sgd_cooling, + num_threads, + progress, + path_sgd_seed, + snapshot, + snapshot_prefix, + _p_sgd_target_paths, + is_ref); + } else { + order = algorithms::path_linear_sgd_order_ssi(graph, + sampled_step_index, + path_sgd_use_paths, + path_sgd_iter_max, + path_sgd_iter_max_learning_rate, + path_sgd_min_term_updates, + path_sgd_delta, + path_sgd_eps, + path_sgd_max_eta, + path_sgd_zipf_theta, + path_sgd_zipf_space, + path_sgd_zipf_space_max, + path_sgd_zipf_space_quantization_step, + path_sgd_cooling, + num_threads, + progress, + path_sgd_seed, + snapshot, + snapshot_prefix, + _p_sgd_target_paths, + is_ref); + } graph.apply_ordering(order, true); } else if (args::get(breadth_first)) { graph.apply_ordering(algorithms::breadth_first_topological_order(graph, bf_chunk_size), true); From 985f13235a7c10c54af38e2a865cb87981f75963 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Thu, 27 Oct 2022 14:32:18 +0200 Subject: [PATCH 12/29] CMAKER --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 288ce106..ecdd817e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -594,6 +594,7 @@ add_library(odgi_objs OBJECT ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_layout.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_helper.cpp + ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_ssi.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/draw.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/layout.cpp ${CMAKE_SOURCE_DIR}/src/algorithms/atomic_image.cpp @@ -763,6 +764,7 @@ set(odgi_HEADERS ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_layout.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_helper.hpp + ${CMAKE_SOURCE_DIR}/src/algorithms/pg_sgd/path_sgd_ssi.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/kmer.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/expand_context.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/id_ordered_paths.hpp From 8e27d5cef8a1e789e31eddf0a3ecb9f619c710b5 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Thu, 27 Oct 2022 15:57:30 +0200 Subject: [PATCH 13/29] node-step-bit-vector initialized --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 54 ++++++++++++++++++++++++-- src/algorithms/pg_sgd/path_sgd_ssi.hpp | 6 ++- src/subcommand/sort_main.cpp | 10 +++-- 3 files changed, 62 insertions(+), 8 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 7d6cd36a..c11d29b8 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -26,7 +26,8 @@ namespace odgi { const bool &snapshot, std::vector &snapshots, const bool &target_sorting, - std::vector& target_nodes) { + std::vector& target_nodes, + const uint64_t &sum_path_step_count) { #ifdef debug_path_sgd std::cerr << "iter_max: " << iter_max << std::endl; std::cerr << "min_term_updates: " << min_term_updates << std::endl; @@ -42,6 +43,51 @@ namespace odgi { uint64_t first_cooling_iteration = std::floor(cooling_start * (double)iter_max); //std::cerr << "first cooling iteration " << first_cooling_iteration << std::endl; + /* + // FIXME this is for testing only + sdsl::bit_vector s_bv; + sdsl::util::assign(s_bv, sdsl::bit_vector(10)); + s_bv[0] = 1; + s_bv[3] = 1; + s_bv[5] = 1; + s_bv[8] = 1; + for (auto bitty : s_bv) { + std::cerr << bitty << std::endl; + } + sdsl::rank_support_v<1> s_bv_rank; + sdsl::bit_vector::select_1_type s_bv_select; + sdsl::util::assign(s_bv_rank, sdsl::rank_support_v<1>(&s_bv)); + sdsl::util::assign(s_bv_select, sdsl::bit_vector::select_1_type(&s_bv)); + std::cerr << "assignment complete" << std::endl; + uint64_t k = 6; + uint64_t k_rank = s_bv_rank(k); + std::cerr << "rank at 6: " << k_rank << std::endl; + uint64_t k_rank_to_select = s_bv_select(k_rank); + std::cerr << "select at rank " << k_rank << ": " << k_rank_to_select << std::endl; + uint64_t step_on_node = k - k_rank_to_select - 1; // 0-based! + std::cerr << "step on node 0-based: " << step_on_node << std::endl; + */ + + /// initialize the bitvector for random sampling + /// each time we see a node we add 1, and each time we see a step we keep it 0 + sdsl::bit_vector ns_bv; // node-step bitvector + sdsl::util::assign(ns_bv, graph.get_node_count() + sum_path_step_count); + uint64_t l = 0; + graph.for_each_handle( + [&](const handle_t &h) { + ns_bv[l] = 1; + graph.for_each_step_on_handle( + h, + [&](const step_handle_t& step) { + l++; + }); + l++; + }); + sdsl::rank_support_v<1> ns_bv_rank; + sdsl::bit_vector::select_1_type ns_bv_select; + sdsl::util::assign(ns_bv_rank, sdsl::rank_support_v<1>(&ns_bv)); + sdsl::util::assign(ns_bv_select, sdsl::bit_vector::select_1_type(&ns_bv)); + /// FIXME this is just so that it runs xp::XP path_index; path_index.from_handle_graph(graph, nthreads); @@ -543,7 +589,8 @@ namespace odgi { const bool &snapshot, const std::string &snapshot_prefix, const bool &target_sorting, - std::vector& target_nodes) { + std::vector& target_nodes, + const uint64_t &sum_path_step_count) { std::vector snapshots; std::vector layout = path_linear_sgd_ssi(graph, sampled_step_index, @@ -564,7 +611,8 @@ namespace odgi { snapshot, snapshots, target_sorting, - target_nodes); + target_nodes, + sum_path_step_count); // TODO move the following into its own function that we can reuse #ifdef debug_components std::cerr << "node count: " << graph.get_node_count() << std::endl; diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.hpp b/src/algorithms/pg_sgd/path_sgd_ssi.hpp index bc8ed2c4..6f9d7e1d 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.hpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.hpp @@ -50,7 +50,8 @@ namespace odgi { const uint64_t &nthreads, const bool &progress, const bool &snapshot, - std::vector &snapshots); + std::vector &snapshots, + const uint64_t &sum_path_step_count); /// our learning schedule std::vector path_linear_sgd_schedule_ssi(const double &w_min, @@ -79,7 +80,8 @@ namespace odgi { const bool &snapshot, const std::string &snapshot_prefix, const bool &target_sorting, - std::vector& target_nodes); + std::vector& target_nodes, + const uint64_t &sum_path_step_count); } diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index 2beac211..35e0c495 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -270,6 +270,7 @@ int main_sort(int argc, char** argv) { std::vector is_ref; std::vector target_paths; uint64_t ssi_sample_rate = 8; + uint64_t sum_path_step_count; if (p_sgd || args::get(pipeline).find('Y') != std::string::npos) { if (_p_sgd_target_paths) { target_paths = utils::load_paths(args::get(_p_sgd_target_paths), graph, "sort"); @@ -286,6 +287,7 @@ int main_sort(int argc, char** argv) { path_index.from_handle_graph(graph, num_threads); // do we load the SSI from file? } else if (ssi_in_file) { + // FIXME we could also only allow an integer as input to build an ssi for the given sample rate sampled_step_index.load(args::get(ssi_in_file)); // store the SSI sampling rate ssi_sample_rate = sampled_step_index.get_sample_rate(); @@ -296,7 +298,7 @@ int main_sort(int argc, char** argv) { } fresh_step_index = true; - uint64_t sum_path_step_count = algorithms::get_sum_path_step_count(path_sgd_use_paths, graph); + sum_path_step_count = algorithms::get_sum_path_step_count(path_sgd_use_paths, graph); if (args::get(p_sgd_min_term_updates_paths)) { path_sgd_min_term_updates = args::get(p_sgd_min_term_updates_paths) * sum_path_step_count; } else { @@ -434,7 +436,8 @@ int main_sort(int argc, char** argv) { snapshot, snapshot_prefix, _p_sgd_target_paths, - is_ref); + is_ref, + sum_path_step_count); } break; } @@ -524,7 +527,8 @@ int main_sort(int argc, char** argv) { snapshot, snapshot_prefix, _p_sgd_target_paths, - is_ref); + is_ref, + sum_path_step_count); } graph.apply_ordering(order, true); } else if (args::get(breadth_first)) { From af6db6f72263abfc17cffcc8eec9cf2a9cb603eb Mon Sep 17 00:00:00 2001 From: subwaystation Date: Fri, 28 Oct 2022 11:46:35 +0200 Subject: [PATCH 14/29] ns_bv in place --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 52 +++++++++++++++++++++----- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index c11d29b8..0bd5c290 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -66,13 +66,24 @@ namespace odgi { std::cerr << "select at rank " << k_rank << ": " << k_rank_to_select << std::endl; uint64_t step_on_node = k - k_rank_to_select - 1; // 0-based! std::cerr << "step on node 0-based: " << step_on_node << std::endl; - */ + // this case will be skipped anyhow + k = 5; + k_rank = s_bv_rank(k); + std::cerr << "rank at 6: " << k_rank << std::endl; + k_rank_to_select = s_bv_select(k_rank); + std::cerr << "select at rank " << k_rank << ": " << k_rank_to_select << std::endl; + step_on_node = k - k_rank_to_select - 1; // 0-based! + std::cerr << "step on node 0-based: " << step_on_node << std::endl; + */ + exit(0); /// initialize the bitvector for random sampling /// each time we see a node we add 1, and each time we see a step we keep it 0 sdsl::bit_vector ns_bv; // node-step bitvector sdsl::util::assign(ns_bv, graph.get_node_count() + sum_path_step_count); uint64_t l = 0; + /// FIXME merge this with the iteration below so we don't iterate twice across all nodes!!!! + /// FIXME add progress bar graph.for_each_handle( [&](const handle_t &h) { ns_bv[l] = 1; @@ -109,6 +120,7 @@ namespace odgi { snapshot_progress[0].store(true); // seed them with the graph order uint64_t len = 0; + /// FIXME we can merge this iteration with the generation of the ns_bv graph.for_each_handle( [&X, &graph, &len](const handle_t &handle) { // nb: we assume that the graph provides a compact handle set @@ -270,7 +282,8 @@ namespace odgi { const sdsl::int_vector<> &nr_iv = path_index.get_nr_iv(); const sdsl::int_vector<> &npi_iv = path_index.get_npi_iv(); // we'll sample from all path steps - std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, np_bv.size() - 1); + // we have a larger search space because of ns_bv + node_count + std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, ns_bv.size() + graph.get_node_count() - 1); std::uniform_int_distribution flip(0, 1); uint64_t term_updates_local = 0; while (work_todo.load()) { @@ -281,16 +294,23 @@ namespace odgi { #ifdef debug_sample_from_nodes std::cerr << "step_index: " << step_index << std::endl; #endif - // FIXME we should use a bv here with rank and select + // if ns_bv[step_index] == 1 we have to continue! + if (ns_bv[step_index] == 1) { + continue; + } + uint64_t handle_rank_of_step_index = ns_bv_rank(step_index); + step_index = step_index - handle_rank_of_step_index; + uint64_t step_rank_on_node_of_step_index = step_index - ns_bv_select(handle_rank_of_step_index) - 1; + + + // FIXME get the actual step_index for testing uint64_t path_i = npi_iv[step_index]; path_handle_t path = as_path_handle(path_i); - // FIXME replace with ODGI - size_t path_step_count = path_index.get_path_step_count(path); + size_t path_step_count = graph.get_step_count(path); if (path_step_count == 1){ continue; } - #ifdef debug_sample_from_nodes std::cerr << "path integer: " << path_i << std::endl; #endif @@ -341,9 +361,22 @@ namespace odgi { } // and the graph handles, which we need to record the update - // FIXME use ODGI here + // FIXME + + /// xp: + //as_integers(step)[0] // path_id + //as_integers(step)[1] // handle_rank + /// ODGI: + //as_integers(step)[0] // handle_rank + //as_integers(step)[1] // path_id handle_t term_i = path_index.get_handle_of_step(step_a); + // std::cerr << "term_i" << std::endl; + // std::cerr << graph.get_id(term_i) << std::endl; + // std::cerr << graph.get_id(graph.get_handle_of_step(step_a)) << std::endl; + // std::cerr << "term_j" << std::endl; handle_t term_j = path_index.get_handle_of_step(step_b); + //std::cerr << graph.get_id(term_j) << std::endl; + //std::cerr << graph.get_id(graph.get_handle_of_step(step_b)) << std::endl; bool update_term_i = true; bool update_term_j = true; @@ -390,9 +423,8 @@ namespace odgi { // term_dist = 1e-9; } #ifdef eval_path_sgd - /// FIXME use ODGI - std::string path_name = path_index.get_path_name(path); - std::cerr << path_name << "\t" << pos_in_path_a << "\t" << pos_in_path_b << "\t" << term_dist << std::endl; + std::string path_name = graph.get_path_name(path); + std::cerr << path_name << "\t" << pos_in_path_a << "\t" << pos_in_path_b << "\t" << term_dist << std::endl; #endif // assert(term_dist == zipf_int); #ifdef debug_path_sgd From cf9631a0e9a6f3f45a6dbb484dbcefd3301fc97c Mon Sep 17 00:00:00 2001 From: subwaystation Date: Fri, 28 Oct 2022 11:55:32 +0200 Subject: [PATCH 15/29] bug fix --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 0bd5c290..6393952c 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -76,12 +76,16 @@ namespace odgi { std::cerr << "step on node 0-based: " << step_on_node << std::endl; */ - exit(0); /// initialize the bitvector for random sampling /// each time we see a node we add 1, and each time we see a step we keep it 0 sdsl::bit_vector ns_bv; // node-step bitvector sdsl::util::assign(ns_bv, graph.get_node_count() + sum_path_step_count); + /// to be super save we also record the orientation of each step + std::vector handle_orientation; + handle_orientation.reserve(sum_path_step_count); uint64_t l = 0; + uint64_t m = 0; + /// FIXME merge this with the iteration below so we don't iterate twice across all nodes!!!! /// FIXME add progress bar graph.for_each_handle( @@ -91,6 +95,8 @@ namespace odgi { h, [&](const step_handle_t& step) { l++; + handle_orientation[m] = graph.get_is_reverse(graph.get_handle_of_step(step)); + m++; }); l++; }); @@ -283,7 +289,8 @@ namespace odgi { const sdsl::int_vector<> &npi_iv = path_index.get_npi_iv(); // we'll sample from all path steps // we have a larger search space because of ns_bv + node_count - std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, ns_bv.size() + graph.get_node_count() - 1); + // std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, ns_bv.size() - 1); + std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, np_bv.size() - 1); std::uniform_int_distribution flip(0, 1); uint64_t term_updates_local = 0; while (work_todo.load()) { @@ -298,10 +305,11 @@ namespace odgi { if (ns_bv[step_index] == 1) { continue; } + /* FIXME uint64_t handle_rank_of_step_index = ns_bv_rank(step_index); step_index = step_index - handle_rank_of_step_index; uint64_t step_rank_on_node_of_step_index = step_index - ns_bv_select(handle_rank_of_step_index) - 1; - + */ // FIXME get the actual step_index for testing uint64_t path_i = npi_iv[step_index]; From fad31c76eade5f6317cb68f354c37e695b00bb6a Mon Sep 17 00:00:00 2001 From: subwaystation Date: Fri, 28 Oct 2022 11:58:21 +0200 Subject: [PATCH 16/29] happy --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 6393952c..ff0461d2 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -289,8 +289,8 @@ namespace odgi { const sdsl::int_vector<> &npi_iv = path_index.get_npi_iv(); // we'll sample from all path steps // we have a larger search space because of ns_bv + node_count - // std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, ns_bv.size() - 1); - std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, np_bv.size() - 1); + std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, ns_bv.size() - 1); + // std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, np_bv.size() - 1); std::uniform_int_distribution flip(0, 1); uint64_t term_updates_local = 0; while (work_todo.load()) { @@ -305,11 +305,10 @@ namespace odgi { if (ns_bv[step_index] == 1) { continue; } - /* FIXME + /// FIXME uint64_t handle_rank_of_step_index = ns_bv_rank(step_index); step_index = step_index - handle_rank_of_step_index; uint64_t step_rank_on_node_of_step_index = step_index - ns_bv_select(handle_rank_of_step_index) - 1; - */ // FIXME get the actual step_index for testing uint64_t path_i = npi_iv[step_index]; From 588800c3b09e8ae196f98c986c03d19b38314d34 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Fri, 28 Oct 2022 14:21:41 +0200 Subject: [PATCH 17/29] pos of 1st step is retrieved via ssi! --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 50 +++++++++++++++----------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index ff0461d2..fe2b5e12 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -297,22 +297,36 @@ namespace odgi { if (!snapshot_in_progress.load()) { // sample the first node from all the nodes in the graph // pick a random position from all paths - uint64_t step_index = dis_step(gen); + uint64_t node_step_index = dis_step(gen); #ifdef debug_sample_from_nodes - std::cerr << "step_index: " << step_index << std::endl; + std::cerr << "step_index: " << node_step_index << std::endl; #endif - // if ns_bv[step_index] == 1 we have to continue! - if (ns_bv[step_index] == 1) { + // if ns_bv[step_index] == 1 we have to continue, because we hit a node! + if (ns_bv[node_step_index] == 1) { continue; } /// FIXME - uint64_t handle_rank_of_step_index = ns_bv_rank(step_index); - step_index = step_index - handle_rank_of_step_index; - uint64_t step_rank_on_node_of_step_index = step_index - ns_bv_select(handle_rank_of_step_index) - 1; + uint64_t handle_rank_of_step_index = ns_bv_rank(node_step_index); // because we have a compacted graph, this is the actual node id! + uint64_t step_index = node_step_index - handle_rank_of_step_index; + uint64_t step_rank_on_node_given_step_index = node_step_index - ns_bv_select(handle_rank_of_step_index) - 1; + // I want to have the handle + handle_t term_i_ssi = graph.get_handle(handle_rank_of_step_index, handle_orientation[step_index]); + // I want to have the step + uint64_t step_rank_on_handle = 0; + step_handle_t step_a_ssi; + graph.for_each_step_on_handle( + term_i_ssi, + [&](const step_handle_t& step) { + if (step_rank_on_handle == step_rank_on_node_given_step_index) { + step_a_ssi = step; + } + step_rank_on_handle++; + }); - // FIXME get the actual step_index for testing uint64_t path_i = npi_iv[step_index]; - path_handle_t path = as_path_handle(path_i); + // FIXME delete this later + // path_handle_t path = as_path_handle(path_i); + path_handle_t path = graph.get_path_handle_of_step(step_a_ssi); size_t path_step_count = graph.get_step_count(path); if (path_step_count == 1){ @@ -323,7 +337,6 @@ namespace odgi { #endif step_handle_t step_a, step_b; as_integers(step_a)[0] = path_i; // path index - // FIXME use the bv here size_t s_rank = nr_iv[step_index] - 1; // step rank in path as_integers(step_a)[1] = s_rank; #ifdef debug_sample_from_nodes @@ -361,7 +374,6 @@ namespace odgi { } } else { // sample randomly across the path - graph.get_step_count(path); std::uniform_int_distribution rando(0, graph.get_step_count(path)-1); as_integers(step_b)[0] = as_integer(path); as_integers(step_b)[1] = rando(gen); @@ -372,18 +384,13 @@ namespace odgi { /// xp: //as_integers(step)[0] // path_id - //as_integers(step)[1] // handle_rank + //as_integers(step)[1] // handle_rank or step_rank? /// ODGI: //as_integers(step)[0] // handle_rank //as_integers(step)[1] // path_id - handle_t term_i = path_index.get_handle_of_step(step_a); - // std::cerr << "term_i" << std::endl; - // std::cerr << graph.get_id(term_i) << std::endl; - // std::cerr << graph.get_id(graph.get_handle_of_step(step_a)) << std::endl; - // std::cerr << "term_j" << std::endl; + // handle_t term_i = path_index.get_handle_of_step(step_a); + handle_t term_i = term_i_ssi; handle_t term_j = path_index.get_handle_of_step(step_b); - //std::cerr << graph.get_id(term_j) << std::endl; - //std::cerr << graph.get_id(graph.get_handle_of_step(step_b)) << std::endl; bool update_term_i = true; bool update_term_j = true; @@ -408,8 +415,9 @@ namespace odgi { } // adjust the positions to the node starts - /// FIXME this should be coming from the bv - size_t pos_in_path_a = path_index.get_position_of_step(step_a); + /// FIXME remove this later + // size_t pos_in_path_a = path_index.get_position_of_step(step_a); + size_t pos_in_path_a = sampled_step_index.get_position(step_a_ssi, graph); /// FIXME we use ODGI to run along until we reach the step with the rank of b size_t pos_in_path_b = path_index.get_position_of_step(step_b); #ifdef debug_path_sgd From aea0b911095778160828940d7d6c1d71d883e97e Mon Sep 17 00:00:00 2001 From: subwaystation Date: Fri, 28 Oct 2022 16:07:01 +0200 Subject: [PATCH 18/29] pos of 2nd step via ssi when we do uniform sampling --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 38 ++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index fe2b5e12..c5b5908e 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -313,11 +313,12 @@ namespace odgi { handle_t term_i_ssi = graph.get_handle(handle_rank_of_step_index, handle_orientation[step_index]); // I want to have the step uint64_t step_rank_on_handle = 0; - step_handle_t step_a_ssi; + step_handle_t step_a_ssi, step_b_ssi; graph.for_each_step_on_handle( term_i_ssi, [&](const step_handle_t& step) { if (step_rank_on_handle == step_rank_on_node_given_step_index) { + // FIXME once we found this, step out of it step_a_ssi = step; } step_rank_on_handle++; @@ -342,8 +343,9 @@ namespace odgi { #ifdef debug_sample_from_nodes std::cerr << "step rank in path: " << nr_iv[step_index] << std::endl; #endif - + bool cool = false; if (cooling.load() || flip(gen)) { + cool = true; auto _theta = adj_theta.load(); if (s_rank > 0 && flip(gen) || s_rank == path_step_count-1) { // go backward @@ -375,8 +377,23 @@ namespace odgi { } else { // sample randomly across the path std::uniform_int_distribution rando(0, graph.get_step_count(path)-1); + // FIXME very inefficient solution! + uint64_t step_rank_b = rando(gen); + uint64_t n = 0; + step_handle_t cur_step = graph.path_begin(path); + bool lost = true; + // has_next_step is very, very costly! + while (lost) { + if (n == step_rank_b) { + cur_step = graph.get_next_step(cur_step); + lost = false; + } + n++; + } + step_b_ssi = cur_step; + as_integers(step_b)[0] = as_integer(path); - as_integers(step_b)[1] = rando(gen); + as_integers(step_b)[1] = step_rank_b; } // and the graph handles, which we need to record the update @@ -389,8 +406,14 @@ namespace odgi { //as_integers(step)[0] // handle_rank //as_integers(step)[1] // path_id // handle_t term_i = path_index.get_handle_of_step(step_a); + handle_t term_i = term_i_ssi; - handle_t term_j = path_index.get_handle_of_step(step_b); + handle_t term_j; + if (cool) { + term_j = path_index.get_handle_of_step(step_b); + } else { + term_j = graph.get_handle_of_step(step_b_ssi); + } bool update_term_i = true; bool update_term_j = true; @@ -419,7 +442,12 @@ namespace odgi { // size_t pos_in_path_a = path_index.get_position_of_step(step_a); size_t pos_in_path_a = sampled_step_index.get_position(step_a_ssi, graph); /// FIXME we use ODGI to run along until we reach the step with the rank of b - size_t pos_in_path_b = path_index.get_position_of_step(step_b); + size_t pos_in_path_b; + if (cool) { + pos_in_path_b = path_index.get_position_of_step(step_b); + } else { + pos_in_path_b = sampled_step_index.get_position(step_b_ssi, graph); + } #ifdef debug_path_sgd std::cerr << "1. pos in path " << pos_in_path_a << " " << pos_in_path_b << std::endl; #endif From e58388586cd6f1bafacbb998f7e01e370cf2cec3 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 31 Oct 2022 11:51:04 +0100 Subject: [PATCH 19/29] some more tweaks --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 82 ++++++++++++++------------ 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index c5b5908e..08dd1176 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -81,16 +81,22 @@ namespace odgi { sdsl::bit_vector ns_bv; // node-step bitvector sdsl::util::assign(ns_bv, graph.get_node_count() + sum_path_step_count); /// to be super save we also record the orientation of each step + uint64_t len = 0; + uint64_t num_nodes = graph.get_node_count(); + // our positions in 1D + std::vector> X(num_nodes); std::vector handle_orientation; handle_orientation.reserve(sum_path_step_count); uint64_t l = 0; uint64_t m = 0; - /// FIXME merge this with the iteration below so we don't iterate twice across all nodes!!!! - /// FIXME add progress bar + // add progress bar here? graph.for_each_handle( [&](const handle_t &h) { ns_bv[l] = 1; + // nb: we assume that the graph provides a compact handle set + X[number_bool_packing::unpack_number(h)].store(len); + len += graph.get_length(h); graph.for_each_step_on_handle( h, [&](const step_handle_t& step) { @@ -116,33 +122,11 @@ namespace odgi { total_term_updates, "[odgi::path_linear_sgd] 1D path-guided SGD:"); } using namespace std::chrono_literals; // for timing stuff - uint64_t num_nodes = graph.get_node_count(); - // our positions in 1D - std::vector> X(num_nodes); atomic snapshot_in_progress; snapshot_in_progress.store(false); std::vector> snapshot_progress(iter_max); // we will produce one less snapshot compared to iterations snapshot_progress[0].store(true); - // seed them with the graph order - uint64_t len = 0; - /// FIXME we can merge this iteration with the generation of the ns_bv - graph.for_each_handle( - [&X, &graph, &len](const handle_t &handle) { - // nb: we assume that the graph provides a compact handle set - X[number_bool_packing::unpack_number(handle)].store(len); - len += graph.get_length(handle); - }); - // the longest path length measured in nucleotides - //size_t longest_path_in_nucleotides = 0; - // the total path length in nucleotides - //size_t total_path_len_in_nucleotides = 0; - // here we store all path nucleotides lengths so we know later from which path we sampled our random position from - //IITree path_nucleotide_tree; - // iterate over all relevant path_handles: - // 1. build the interval tree - // 2. find out the longest path in nucleotides and store this number size_t - // 3. add the current path length to the total length bool at_least_one_path_with_more_than_one_step = false; for (auto &path : path_sgd_use_paths) { @@ -155,19 +139,11 @@ namespace odgi { size_t path_len = path_index.get_path_length(path); std::cerr << path_name << " has length: " << path_len << std::endl; #endif - //path_nucleotide_tree.add(total_path_len_in_nucleotides, total_path_len_in_nucleotides + path_len, path); - - //if (path_len > longest_path_in_nucleotides) { - // longest_path_in_nucleotides = path_len; - //} - //total_path_len_in_nucleotides += path_len; - if (path_index.get_path_step_count(path) > 1){ at_least_one_path_with_more_than_one_step = true; break; } } - //path_nucleotide_tree.index(); if (at_least_one_path_with_more_than_one_step){ double w_min = (double) 1.0 / (double) (eta_max); @@ -194,7 +170,6 @@ namespace odgi { std::vector zetas((space <= space_max ? space : space_max + (space - space_max) / space_quantization_step + 1)+1); uint64_t last_quantized_i = 0; - // FIXME add progress bar here? #pragma omp parallel for schedule(static,1) for (uint64_t i = 1; i < space+1; ++i) { uint64_t quantized_i = i; @@ -284,7 +259,7 @@ namespace odgi { XoshiroCpp::Xoshiro256Plus gen(seed); // a nice, fast PRNG // FIXME these should be replaced by the bv and ssi // some references to literal bitvectors in the path index hmmm - const sdsl::bit_vector &np_bv = path_index.get_np_bv(); + // const sdsl::bit_vector &np_bv = path_index.get_np_bv(); const sdsl::int_vector<> &nr_iv = path_index.get_nr_iv(); const sdsl::int_vector<> &npi_iv = path_index.get_npi_iv(); // we'll sample from all path steps @@ -305,7 +280,6 @@ namespace odgi { if (ns_bv[node_step_index] == 1) { continue; } - /// FIXME uint64_t handle_rank_of_step_index = ns_bv_rank(node_step_index); // because we have a compacted graph, this is the actual node id! uint64_t step_index = node_step_index - handle_rank_of_step_index; uint64_t step_rank_on_node_given_step_index = node_step_index - ns_bv_select(handle_rank_of_step_index) - 1; @@ -318,8 +292,10 @@ namespace odgi { term_i_ssi, [&](const step_handle_t& step) { if (step_rank_on_handle == step_rank_on_node_given_step_index) { - // FIXME once we found this, step out of it step_a_ssi = step; + // FIXME once we found this, step out of it + // I am not sure what is happening here, but this return seems to abort the mission too early + // return; } step_rank_on_handle++; }); @@ -347,9 +323,35 @@ namespace odgi { if (cooling.load() || flip(gen)) { cool = true; auto _theta = adj_theta.load(); + + /// IN PROGRESS + bool path_end = false; + // FIXME building up alternative solution + while (!path_end) { + // go backward + if (flip(gen)) { + uint64_t local_space; + // FIXME we need to limit the jump length by the given path length? + if (space > space_max){ + local_space = space_max + (space - space_max) / space_quantization_step + 1; + } + dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, space, _theta, zetas[local_space]); + dirtyzipf::dirty_zipfian_int_distribution z(z_p); + uint64_t z_i = z(gen); + path_end = true; + // go forward + } else { + + } + } + /// IN PROGRESS END + + // if I understand correctly, we can replace [s_rank == path_step_count - 1] with graph.has_next_step(step_a)? + // BUT TRY TO COMPARE AGAINST THE LAST STEP, BECAUSE HAS NEXT_STEP IS SLOW if (s_rank > 0 && flip(gen) || s_rank == path_step_count-1) { + // FIXME why do we check if the step_rank is larger 0? // go backward - uint64_t jump_space = std::min(space, s_rank); + uint64_t jump_space = std::min(space, s_rank); // replacing s_rank with space uint64_t space = jump_space; if (jump_space > space_max){ space = space_max + (jump_space - space_max) / space_quantization_step + 1; @@ -358,6 +360,7 @@ namespace odgi { dirtyzipf::dirty_zipfian_int_distribution z(z_p); uint64_t z_i = z(gen); //assert(z_i <= path_space); + // FIXME travel there :) as_integers(step_b)[0] = as_integer(path); as_integers(step_b)[1] = s_rank - z_i; } else { @@ -377,7 +380,7 @@ namespace odgi { } else { // sample randomly across the path std::uniform_int_distribution rando(0, graph.get_step_count(path)-1); - // FIXME very inefficient solution! + // FIXME very inefficient solution! Is there a more random way?! uint64_t step_rank_b = rando(gen); uint64_t n = 0; step_handle_t cur_step = graph.path_begin(path); @@ -401,7 +404,7 @@ namespace odgi { /// xp: //as_integers(step)[0] // path_id - //as_integers(step)[1] // handle_rank or step_rank? + //as_integers(step)[1] // step_rank? /// ODGI: //as_integers(step)[0] // handle_rank //as_integers(step)[1] // path_id @@ -411,6 +414,7 @@ namespace odgi { handle_t term_j; if (cool) { term_j = path_index.get_handle_of_step(step_b); + // FIXME laters! } else { term_j = graph.get_handle_of_step(step_b_ssi); } From 64d6dfd1e6e386235b5d6c061e0029ffed6e6a57 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 31 Oct 2022 14:54:22 +0100 Subject: [PATCH 20/29] hmm --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 68 ++++++++++++++++++++------ 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 08dd1176..3ae3835f 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -326,24 +326,54 @@ namespace odgi { /// IN PROGRESS bool path_end = false; + const uint64_t path_steps = sampled_step_index.get_path_len(path); + uint64_t local_space; + // FIXME we need to limit the jump length by the given path length? + if (path_steps > space_max){ + local_space = space_max + (path_steps - space_max) / space_quantization_step + 1; + } + dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_steps, _theta, zetas[local_space]); + dirtyzipf::dirty_zipfian_int_distribution z(z_p); + uint64_t z_i = z(gen); + uint64_t steps_travelled = 0; + step_handle_t last_step_in_path = graph.path_back(path); + step_handle_t first_step_in_path = graph.path_begin(path); + step_handle_t cur_step = step_a_ssi; + // FIXME building up alternative solution - while (!path_end) { - // go backward - if (flip(gen)) { - uint64_t local_space; - // FIXME we need to limit the jump length by the given path length? - if (space > space_max){ - local_space = space_max + (space - space_max) / space_quantization_step + 1; + if (flip(gen)) { + // go backwards + while (!path_end) { + // did we reach the left end already? + if ((as_integers(cur_step)[0] == as_integers(first_step_in_path)[0]) && + (as_integers(cur_step)[1] == as_integers(first_step_in_path)[1])) { + cur_step = graph.path_back(path); + } else { + cur_step = graph.get_previous_step(cur_step); + } + steps_travelled++; + // this assumes z_i is at least 1; + if (z_i == steps_travelled) { + path_end = true; + } + } + // go forward + } else { + while (!path_end) { + // did we reach the right already? + if ((as_integers(cur_step)[0] == as_integers(last_step_in_path)[0]) && + (as_integers(cur_step)[1] == as_integers(last_step_in_path)[1])) { + cur_step = graph.path_begin(path); + } else { + cur_step = graph.get_next_step(cur_step); + } + steps_travelled++; + if (z_i == steps_travelled) { + path_end = true; } - dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, space, _theta, zetas[local_space]); - dirtyzipf::dirty_zipfian_int_distribution z(z_p); - uint64_t z_i = z(gen); - path_end = true; - // go forward - } else { - } } + step_b_ssi = cur_step; /// IN PROGRESS END // if I understand correctly, we can replace [s_rank == path_step_count - 1] with graph.has_next_step(step_a)? @@ -446,12 +476,14 @@ namespace odgi { // size_t pos_in_path_a = path_index.get_position_of_step(step_a); size_t pos_in_path_a = sampled_step_index.get_position(step_a_ssi, graph); /// FIXME we use ODGI to run along until we reach the step with the rank of b - size_t pos_in_path_b; + size_t pos_in_path_b = sampled_step_index.get_position(step_b_ssi, graph); + /* if (cool) { pos_in_path_b = path_index.get_position_of_step(step_b); } else { pos_in_path_b = sampled_step_index.get_position(step_b_ssi, graph); } + */ #ifdef debug_path_sgd std::cerr << "1. pos in path " << pos_in_path_a << " " << pos_in_path_b << std::endl; #endif @@ -697,6 +729,8 @@ namespace odgi { std::cerr << "node count: " << graph.get_node_count() << std::endl; #endif // refine order by weakly connected components + + // prepare weakly connected components std::vector> weak_components = algorithms::weakly_connected_components( &graph); #ifdef debug_components @@ -713,7 +747,7 @@ namespace odgi { weak_component_order.push_back(std::make_pair(avg_id, i)); } std::sort(weak_component_order.begin(), weak_component_order.end()); - std::vector weak_component_id; // maps rank to "id" based on the orignial sorted order + std::vector weak_component_id; // maps rank to "id" based on the original sorted order weak_component_id.resize(weak_component_order.size()); uint64_t component_id = 0; for (auto &component_order : weak_component_order) { @@ -734,6 +768,7 @@ namespace odgi { #endif } weak_components_map.clear(); + // generate and write snapshot graphs if (snapshot) { for (int j = 0; j < snapshots.size(); j++) { std::string snapshot_file_name = snapshots[j]; @@ -783,6 +818,7 @@ namespace odgi { f.close(); } } + // from layout to order std::vector handle_layout; uint64_t i = 0; graph.for_each_handle( From 2cd5023e31531360ed083c2ef6315d932c47c08b Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 31 Oct 2022 15:31:56 +0100 Subject: [PATCH 21/29] working somehow --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 3ae3835f..97de2897 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -329,18 +329,20 @@ namespace odgi { const uint64_t path_steps = sampled_step_index.get_path_len(path); uint64_t local_space; // FIXME we need to limit the jump length by the given path length? + // FIXME I think the problem is how the zetas are created?! if (path_steps > space_max){ local_space = space_max + (path_steps - space_max) / space_quantization_step + 1; } - dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_steps, _theta, zetas[local_space]); + //dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_steps, _theta, zetas[local_space]); + dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_steps/1000, _theta); dirtyzipf::dirty_zipfian_int_distribution z(z_p); uint64_t z_i = z(gen); + uint64_t steps_travelled = 0; step_handle_t last_step_in_path = graph.path_back(path); step_handle_t first_step_in_path = graph.path_begin(path); step_handle_t cur_step = step_a_ssi; - // FIXME building up alternative solution if (flip(gen)) { // go backwards while (!path_end) { @@ -381,7 +383,7 @@ namespace odgi { if (s_rank > 0 && flip(gen) || s_rank == path_step_count-1) { // FIXME why do we check if the step_rank is larger 0? // go backward - uint64_t jump_space = std::min(space, s_rank); // replacing s_rank with space + uint64_t jump_space = std::min(space, s_rank); uint64_t space = jump_space; if (jump_space > space_max){ space = space_max + (jump_space - space_max) / space_quantization_step + 1; @@ -448,6 +450,7 @@ namespace odgi { } else { term_j = graph.get_handle_of_step(step_b_ssi); } + term_j = graph.get_handle_of_step(step_b_ssi); bool update_term_i = true; bool update_term_j = true; From 7e192b3aeea25061f1ba0f0b131f007990170d95 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 31 Oct 2022 15:52:52 +0100 Subject: [PATCH 22/29] cleanup --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 113 +++---------------------- 1 file changed, 14 insertions(+), 99 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 97de2897..a00e5ceb 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -43,8 +43,9 @@ namespace odgi { uint64_t first_cooling_iteration = std::floor(cooling_start * (double)iter_max); //std::cerr << "first cooling iteration " << first_cooling_iteration << std::endl; - /* + // FIXME this is for testing only +#ifdef debug_tests sdsl::bit_vector s_bv; sdsl::util::assign(s_bv, sdsl::bit_vector(10)); s_bv[0] = 1; @@ -74,7 +75,7 @@ namespace odgi { std::cerr << "select at rank " << k_rank << ": " << k_rank_to_select << std::endl; step_on_node = k - k_rank_to_select - 1; // 0-based! std::cerr << "step on node 0-based: " << step_on_node << std::endl; - */ +#endif /// initialize the bitvector for random sampling /// each time we see a node we add 1, and each time we see a step we keep it 0 @@ -111,10 +112,6 @@ namespace odgi { sdsl::util::assign(ns_bv_rank, sdsl::rank_support_v<1>(&ns_bv)); sdsl::util::assign(ns_bv_select, sdsl::bit_vector::select_1_type(&ns_bv)); - /// FIXME this is just so that it runs - xp::XP path_index; - path_index.from_handle_graph(graph, nthreads); - uint64_t total_term_updates = iter_max * min_term_updates; std::unique_ptr progress_meter; if (progress) { @@ -139,7 +136,7 @@ namespace odgi { size_t path_len = path_index.get_path_length(path); std::cerr << path_name << " has length: " << path_len << std::endl; #endif - if (path_index.get_path_step_count(path) > 1){ + if (graph.get_step_count(path) > 1){ at_least_one_path_with_more_than_one_step = true; break; } @@ -164,6 +161,7 @@ namespace odgi { eps); // cache zipf zetas for our full path space (heavy, but one-off) + // FIXME we need to adjust this for the new stuff, maybe for each path? if (progress) { std::cerr << "[odgi::path_linear_sgd] calculating zetas for " << (space <= space_max ? space : space_max + (space - space_max) / space_quantization_step + 1) << " zipf distributions" << std::endl; } @@ -257,11 +255,6 @@ namespace odgi { // everyone tries to seed with their own random data const std::uint64_t seed = 9399220 + tid; XoshiroCpp::Xoshiro256Plus gen(seed); // a nice, fast PRNG - // FIXME these should be replaced by the bv and ssi - // some references to literal bitvectors in the path index hmmm - // const sdsl::bit_vector &np_bv = path_index.get_np_bv(); - const sdsl::int_vector<> &nr_iv = path_index.get_nr_iv(); - const sdsl::int_vector<> &npi_iv = path_index.get_npi_iv(); // we'll sample from all path steps // we have a larger search space because of ns_bv + node_count std::uniform_int_distribution dis_step = std::uniform_int_distribution(0, ns_bv.size() - 1); @@ -300,40 +293,24 @@ namespace odgi { step_rank_on_handle++; }); - uint64_t path_i = npi_iv[step_index]; - // FIXME delete this later - // path_handle_t path = as_path_handle(path_i); path_handle_t path = graph.get_path_handle_of_step(step_a_ssi); - size_t path_step_count = graph.get_step_count(path); if (path_step_count == 1){ continue; } -#ifdef debug_sample_from_nodes - std::cerr << "path integer: " << path_i << std::endl; -#endif - step_handle_t step_a, step_b; - as_integers(step_a)[0] = path_i; // path index - size_t s_rank = nr_iv[step_index] - 1; // step rank in path - as_integers(step_a)[1] = s_rank; -#ifdef debug_sample_from_nodes - std::cerr << "step rank in path: " << nr_iv[step_index] << std::endl; -#endif - bool cool = false; + if (cooling.load() || flip(gen)) { - cool = true; auto _theta = adj_theta.load(); - - /// IN PROGRESS bool path_end = false; const uint64_t path_steps = sampled_step_index.get_path_len(path); uint64_t local_space; // FIXME we need to limit the jump length by the given path length? // FIXME I think the problem is how the zetas are created?! + // FIXME @Andrea we need to fix this somehow if (path_steps > space_max){ local_space = space_max + (path_steps - space_max) / space_quantization_step + 1; } - //dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_steps, _theta, zetas[local_space]); + //dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_steps, _theta, zetas[local_space]); // FIXME @Andrea dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_steps/1000, _theta); dirtyzipf::dirty_zipfian_int_distribution z(z_p); uint64_t z_i = z(gen); @@ -344,7 +321,8 @@ namespace odgi { step_handle_t cur_step = step_a_ssi; if (flip(gen)) { - // go backwards + // go backward + // FIXME if we have to go backward more steps than half the total number of steps, we should actually go forward, because we will have less steps to travel while (!path_end) { // did we reach the left end already? if ((as_integers(cur_step)[0] == as_integers(first_step_in_path)[0]) && @@ -361,6 +339,7 @@ namespace odgi { } // go forward } else { + // FIXME if we have to go forward more steps than halft the total number of steps, we should actually go backward, because we will have less steps to travel while (!path_end) { // did we reach the right already? if ((as_integers(cur_step)[0] == as_integers(last_step_in_path)[0]) && @@ -376,39 +355,6 @@ namespace odgi { } } step_b_ssi = cur_step; - /// IN PROGRESS END - - // if I understand correctly, we can replace [s_rank == path_step_count - 1] with graph.has_next_step(step_a)? - // BUT TRY TO COMPARE AGAINST THE LAST STEP, BECAUSE HAS NEXT_STEP IS SLOW - if (s_rank > 0 && flip(gen) || s_rank == path_step_count-1) { - // FIXME why do we check if the step_rank is larger 0? - // go backward - uint64_t jump_space = std::min(space, s_rank); - uint64_t space = jump_space; - if (jump_space > space_max){ - space = space_max + (jump_space - space_max) / space_quantization_step + 1; - } - dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, jump_space, _theta, zetas[space]); - dirtyzipf::dirty_zipfian_int_distribution z(z_p); - uint64_t z_i = z(gen); - //assert(z_i <= path_space); - // FIXME travel there :) - as_integers(step_b)[0] = as_integer(path); - as_integers(step_b)[1] = s_rank - z_i; - } else { - // go forward - uint64_t jump_space = std::min(space, path_step_count - s_rank - 1); - uint64_t space = jump_space; - if (jump_space > space_max){ - space = space_max + (jump_space - space_max) / space_quantization_step + 1; - } - dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, jump_space, _theta, zetas[space]); - dirtyzipf::dirty_zipfian_int_distribution z(z_p); - uint64_t z_i = z(gen); - //assert(z_i <= path_space); - as_integers(step_b)[0] = as_integer(path); - as_integers(step_b)[1] = s_rank + z_i; - } } else { // sample randomly across the path std::uniform_int_distribution rando(0, graph.get_step_count(path)-1); @@ -426,31 +372,10 @@ namespace odgi { n++; } step_b_ssi = cur_step; - - as_integers(step_b)[0] = as_integer(path); - as_integers(step_b)[1] = step_rank_b; } - // and the graph handles, which we need to record the update - // FIXME - - /// xp: - //as_integers(step)[0] // path_id - //as_integers(step)[1] // step_rank? - /// ODGI: - //as_integers(step)[0] // handle_rank - //as_integers(step)[1] // path_id - // handle_t term_i = path_index.get_handle_of_step(step_a); - handle_t term_i = term_i_ssi; - handle_t term_j; - if (cool) { - term_j = path_index.get_handle_of_step(step_b); - // FIXME laters! - } else { - term_j = graph.get_handle_of_step(step_b_ssi); - } - term_j = graph.get_handle_of_step(step_b_ssi); + handle_t term_j = graph.get_handle_of_step(step_b_ssi); bool update_term_i = true; bool update_term_j = true; @@ -475,23 +400,13 @@ namespace odgi { } // adjust the positions to the node starts - /// FIXME remove this later - // size_t pos_in_path_a = path_index.get_position_of_step(step_a); size_t pos_in_path_a = sampled_step_index.get_position(step_a_ssi, graph); - /// FIXME we use ODGI to run along until we reach the step with the rank of b size_t pos_in_path_b = sampled_step_index.get_position(step_b_ssi, graph); - /* - if (cool) { - pos_in_path_b = path_index.get_position_of_step(step_b); - } else { - pos_in_path_b = sampled_step_index.get_position(step_b_ssi, graph); - } - */ #ifdef debug_path_sgd std::cerr << "1. pos in path " << pos_in_path_a << " " << pos_in_path_b << std::endl; #endif - // assert(pos_in_path_a < path_index.get_path_length(path)); - // assert(pos_in_path_b < path_index.get_path_length(path)); + // assert(pos_in_path_a < sampled_step_index.get_path_len(path)); + // assert(pos_in_path_b < sampled_step_index.get_path_len(path)); #ifdef debug_path_sgd std::cerr << "2. pos in path " << pos_in_path_a << " " << pos_in_path_b << std::endl; From c824efe8f373f3fd0e949e399ab4479e6d18fc0a Mon Sep 17 00:00:00 2001 From: subwaystation Date: Wed, 2 Nov 2022 11:50:26 +0100 Subject: [PATCH 23/29] playing around --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 24 +++++++++++++----------- src/subcommand/sort_main.cpp | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index a00e5ceb..8004a92e 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -44,7 +44,7 @@ namespace odgi { //std::cerr << "first cooling iteration " << first_cooling_iteration << std::endl; - // FIXME this is for testing only + // this is for testing only #ifdef debug_tests sdsl::bit_vector s_bv; sdsl::util::assign(s_bv, sdsl::bit_vector(10)); @@ -302,16 +302,15 @@ namespace odgi { if (cooling.load() || flip(gen)) { auto _theta = adj_theta.load(); bool path_end = false; - const uint64_t path_steps = sampled_step_index.get_path_len(path); uint64_t local_space; // FIXME we need to limit the jump length by the given path length? // FIXME I think the problem is how the zetas are created?! // FIXME @Andrea we need to fix this somehow - if (path_steps > space_max){ - local_space = space_max + (path_steps - space_max) / space_quantization_step + 1; + if (path_step_count > space_max){ + local_space = space_max + (path_step_count - space_max) / space_quantization_step + 1; } - //dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_steps, _theta, zetas[local_space]); // FIXME @Andrea - dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_steps/1000, _theta); + //dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_step_count, _theta, zetas[local_space]); // FIXME @Andrea + dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_step_count/100, _theta); // 100 for DRB1-3123 data set dirtyzipf::dirty_zipfian_int_distribution z(z_p); uint64_t z_i = z(gen); @@ -322,7 +321,6 @@ namespace odgi { if (flip(gen)) { // go backward - // FIXME if we have to go backward more steps than half the total number of steps, we should actually go forward, because we will have less steps to travel while (!path_end) { // did we reach the left end already? if ((as_integers(cur_step)[0] == as_integers(first_step_in_path)[0]) && @@ -339,7 +337,6 @@ namespace odgi { } // go forward } else { - // FIXME if we have to go forward more steps than halft the total number of steps, we should actually go backward, because we will have less steps to travel while (!path_end) { // did we reach the right already? if ((as_integers(cur_step)[0] == as_integers(last_step_in_path)[0]) && @@ -349,6 +346,7 @@ namespace odgi { cur_step = graph.get_next_step(cur_step); } steps_travelled++; + // this assumes z_i is at least 1; if (z_i == steps_travelled) { path_end = true; } @@ -361,16 +359,20 @@ namespace odgi { // FIXME very inefficient solution! Is there a more random way?! uint64_t step_rank_b = rando(gen); uint64_t n = 0; - step_handle_t cur_step = graph.path_begin(path); + step_handle_t cur_step; + cur_step = graph.path_begin(path); bool lost = true; - // has_next_step is very, very costly! + // has_next_step is very, very costly and already unfeasable for yeast! + // 10 minutes versus 15 seconds! + /* while (lost) { if (n == step_rank_b) { - cur_step = graph.get_next_step(cur_step); lost = false; } + cur_step = graph.get_next_step(cur_step); n++; } + */ step_b_ssi = cur_step; } diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index 35e0c495..32ebb39d 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -154,7 +154,7 @@ int main_sort(int argc, char** argv) { return 1; } - const bool xp_way = xp_in_file || p_sgd_xp; + const bool xp_way = ((xp_in_file || p_sgd_xp) && p_sgd) ; if (xp_way && ssi_in_file) { std::cerr From 7327513b8cdb5519b12f13aa6dfb9f3cb2da66e2 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Wed, 2 Nov 2022 12:51:39 +0100 Subject: [PATCH 24/29] better mirror initial implementation --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 8004a92e..405ec449 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -319,7 +319,7 @@ namespace odgi { step_handle_t first_step_in_path = graph.path_begin(path); step_handle_t cur_step = step_a_ssi; - if (flip(gen)) { + if (flip(gen)) { // if (s_rank > 0 && flip(gen) || s_rank == path_step_count-1) { WHAT TO DO HERE? // go backward while (!path_end) { // did we reach the left end already? @@ -364,7 +364,6 @@ namespace odgi { bool lost = true; // has_next_step is very, very costly and already unfeasable for yeast! // 10 minutes versus 15 seconds! - /* while (lost) { if (n == step_rank_b) { lost = false; @@ -372,7 +371,6 @@ namespace odgi { cur_step = graph.get_next_step(cur_step); n++; } - */ step_b_ssi = cur_step; } From b675cad65b53c3b983198a0515d1aa7df1cd8ccf Mon Sep 17 00:00:00 2001 From: subwaystation Date: Wed, 2 Nov 2022 13:12:48 +0100 Subject: [PATCH 25/29] cleanup path_linear_sgd_schedule --- src/algorithms/pg_sgd/path_sgd.cpp | 39 +---------------------- src/algorithms/pg_sgd/path_sgd_helper.cpp | 37 +++++++++++++++++++++ src/algorithms/pg_sgd/path_sgd_helper.hpp | 8 +++++ src/algorithms/pg_sgd/path_sgd_ssi.cpp | 39 +---------------------- 4 files changed, 47 insertions(+), 76 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd.cpp b/src/algorithms/pg_sgd/path_sgd.cpp index 3e5a09f7..0c2c2d62 100644 --- a/src/algorithms/pg_sgd/path_sgd.cpp +++ b/src/algorithms/pg_sgd/path_sgd.cpp @@ -114,7 +114,7 @@ namespace odgi { std::cerr << "[odgi::path_linear_sgd] calculating linear SGD schedule (" << w_min << " " << w_max << " " << iter_max << " " << iter_with_max_learning_rate << " " << eps << ")" << std::endl; } - std::vector etas = path_linear_sgd_schedule(w_min, + std::vector etas = algorithms::path_linear_sgd_schedule(w_min, w_max, iter_max, iter_with_max_learning_rate, @@ -474,43 +474,6 @@ namespace odgi { return X_final; } - std::vector path_linear_sgd_schedule(const double &w_min, - const double &w_max, - const uint64_t &iter_max, - const uint64_t &iter_with_max_learning_rate, - const double &eps) { -#ifdef debug_schedule - std::cerr << "w_min: " << w_min << std::endl; - std::cerr << "w_max: " << w_max << std::endl; - std::cerr << "iter_max: " << iter_max << std::endl; - std::cerr << "eps: " << eps << std::endl; -#endif - double eta_max = 1.0 / w_min; - double eta_min = eps / w_max; - double lambda = log(eta_max / eta_min) / ((double) iter_max - 1); -#ifdef debug_schedule - std::cerr << "eta_max: " << eta_max << std::endl; - std::cerr << "eta_min: " << eta_min << std::endl; - std::cerr << "lambda: " << lambda << std::endl; -#endif - // initialize step sizes - std::vector etas; - etas.reserve(iter_max + 1); -#ifdef debug_schedule - std::cerr << "etas: "; -#endif - for (int64_t t = 0; t <= iter_max; t++) { - etas.push_back(eta_max * exp(-lambda * (abs(t - (int64_t) iter_with_max_learning_rate)))); -#ifdef debug_schedule - std::cerr << etas.back() << ", "; -#endif - } -#ifdef debug_schedule - std::cerr << std::endl; -#endif - return etas; - } - std::vector path_linear_sgd_order(const graph_t &graph, const xp::XP &path_index, const std::vector &path_sgd_use_paths, diff --git a/src/algorithms/pg_sgd/path_sgd_helper.cpp b/src/algorithms/pg_sgd/path_sgd_helper.cpp index bb597308..8dc53314 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.cpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.cpp @@ -79,5 +79,42 @@ namespace odgi { std::fill_n(is_ref.begin(), ref_nodes, true); std::fill(is_ref.begin() + ref_nodes, is_ref.end(), false); } + + std::vector path_linear_sgd_schedule(const double &w_min, + const double &w_max, + const uint64_t &iter_max, + const uint64_t &iter_with_max_learning_rate, + const double &eps) { +#ifdef debug_schedule + std::cerr << "w_min: " << w_min << std::endl; + std::cerr << "w_max: " << w_max << std::endl; + std::cerr << "iter_max: " << iter_max << std::endl; + std::cerr << "eps: " << eps << std::endl; +#endif + double eta_max = 1.0 / w_min; + double eta_min = eps / w_max; + double lambda = log(eta_max / eta_min) / ((double) iter_max - 1); +#ifdef debug_schedule + std::cerr << "eta_max: " << eta_max << std::endl; + std::cerr << "eta_min: " << eta_min << std::endl; + std::cerr << "lambda: " << lambda << std::endl; +#endif + // initialize step sizes + std::vector etas; + etas.reserve(iter_max + 1); +#ifdef debug_schedule + std::cerr << "etas: "; +#endif + for (int64_t t = 0; t <= iter_max; t++) { + etas.push_back(eta_max * exp(-lambda * (abs(t - (int64_t) iter_with_max_learning_rate)))); +#ifdef debug_schedule + std::cerr << etas.back() << ", "; +#endif + } +#ifdef debug_schedule + std::cerr << std::endl; +#endif + return etas; + } } } diff --git a/src/algorithms/pg_sgd/path_sgd_helper.hpp b/src/algorithms/pg_sgd/path_sgd_helper.hpp index 5c76dcf2..041a055d 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.hpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.hpp @@ -31,8 +31,16 @@ namespace odgi { const uint64_t get_max_path_length_ssi(const std::vector &path_sgd_use_paths, const algorithms::step_index_t &sampled_step_index); + /// sort the graph's nodes by given target paths, we need this for the reference based sorting const void sort_graph_by_target_paths(graph_t& graph, const std::vector &target_paths, std::vector& is_ref, const bool &progress); + + /// calculate the schedule + std::vector path_linear_sgd_schedule(const double &w_min, + const double &w_max, + const uint64_t &iter_max, + const uint64_t &iter_with_max_learning_rate, + const double &eps); } } diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 405ec449..08a925ca 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -154,7 +154,7 @@ namespace odgi { std::cerr << "[odgi::path_linear_sgd] calculating linear SGD schedule (" << w_min << " " << w_max << " " << iter_max << " " << iter_with_max_learning_rate << " " << eps << ")" << std::endl; } - std::vector etas = path_linear_sgd_schedule_ssi(w_min, + std::vector etas = algorithms::path_linear_sgd_schedule(w_min, w_max, iter_max, iter_with_max_learning_rate, @@ -561,43 +561,6 @@ namespace odgi { return X_final; } - std::vector path_linear_sgd_schedule_ssi(const double &w_min, - const double &w_max, - const uint64_t &iter_max, - const uint64_t &iter_with_max_learning_rate, - const double &eps) { -#ifdef debug_schedule - std::cerr << "w_min: " << w_min << std::endl; - std::cerr << "w_max: " << w_max << std::endl; - std::cerr << "iter_max: " << iter_max << std::endl; - std::cerr << "eps: " << eps << std::endl; -#endif - double eta_max = 1.0 / w_min; - double eta_min = eps / w_max; - double lambda = log(eta_max / eta_min) / ((double) iter_max - 1); -#ifdef debug_schedule - std::cerr << "eta_max: " << eta_max << std::endl; - std::cerr << "eta_min: " << eta_min << std::endl; - std::cerr << "lambda: " << lambda << std::endl; -#endif - // initialize step sizes - std::vector etas; - etas.reserve(iter_max + 1); -#ifdef debug_schedule - std::cerr << "etas: "; -#endif - for (int64_t t = 0; t <= iter_max; t++) { - etas.push_back(eta_max * exp(-lambda * (abs(t - (int64_t) iter_with_max_learning_rate)))); -#ifdef debug_schedule - std::cerr << etas.back() << ", "; -#endif - } -#ifdef debug_schedule - std::cerr << std::endl; -#endif - return etas; - } - std::vector path_linear_sgd_order_ssi(graph_t &graph, const algorithms::step_index_t &sampled_step_index, const std::vector &path_sgd_use_paths, From 91c9f34a04a6a0f1917c9aacc90887843992af9e Mon Sep 17 00:00:00 2001 From: subwaystation Date: Thu, 3 Nov 2022 14:28:34 +0100 Subject: [PATCH 26/29] refactor prepare_weak_connected_components_map --- src/algorithms/pg_sgd/path_sgd.cpp | 50 +++----------------- src/algorithms/pg_sgd/path_sgd.hpp | 4 +- src/algorithms/pg_sgd/path_sgd_helper.cpp | 46 +++++++++++++++++++ src/algorithms/pg_sgd/path_sgd_helper.hpp | 7 ++- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 56 ++++------------------- 5 files changed, 69 insertions(+), 94 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd.cpp b/src/algorithms/pg_sgd/path_sgd.cpp index 0c2c2d62..331764a3 100644 --- a/src/algorithms/pg_sgd/path_sgd.cpp +++ b/src/algorithms/pg_sgd/path_sgd.cpp @@ -8,7 +8,7 @@ namespace odgi { namespace algorithms { - std::vector path_linear_sgd(const graph_t &graph, + std::vector path_linear_sgd(graph_t &graph, const xp::XP &path_index, const std::vector &path_sgd_use_paths, const uint64_t &iter_max, @@ -474,7 +474,7 @@ namespace odgi { return X_final; } - std::vector path_linear_sgd_order(const graph_t &graph, + std::vector path_linear_sgd_order(graph_t &graph, const xp::XP &path_index, const std::vector &path_sgd_use_paths, const uint64_t &iter_max, @@ -516,48 +516,10 @@ namespace odgi { snapshots, target_sorting, target_nodes); - // TODO move the following into its own function that we can reuse -#ifdef debug_components - std::cerr << "node count: " << graph.get_node_count() << std::endl; -#endif - // refine order by weakly connected components - std::vector> weak_components = algorithms::weakly_connected_components( - &graph); -#ifdef debug_components - std::cerr << "components count: " << weak_components.size() << std::endl; -#endif - std::vector> weak_component_order; - for (int i = 0; i < weak_components.size(); i++) { - auto &weak_component = weak_components[i]; - uint64_t id_sum = 0; - for (auto node_id : weak_component) { - id_sum += node_id; - } - double avg_id = id_sum / (double) weak_component.size(); - weak_component_order.push_back(std::make_pair(avg_id, i)); - } - std::sort(weak_component_order.begin(), weak_component_order.end()); - std::vector weak_component_id; // maps rank to "id" based on the orignial sorted order - weak_component_id.resize(weak_component_order.size()); - uint64_t component_id = 0; - for (auto &component_order : weak_component_order) { - weak_component_id[component_order.second] = component_id++; - } - std::vector weak_components_map; - weak_components_map.resize(graph.get_node_count()); - // reserve the space we need - for (int i = 0; i < weak_components.size(); i++) { - auto &weak_component = weak_components[i]; - // store for each node identifier to component start index - for (auto node_id : weak_component) { - weak_components_map[node_id - 1] = weak_component_id[i]; - } -#ifdef debug_components - std::cerr << "weak_component.size(): " << weak_component.size() << std::endl; - std::cerr << "component_index: " << i << std::endl; -#endif - } - weak_components_map.clear(); + // prepare the weak components map + std::vector weak_components_map; + algorithms::prepare_weak_connected_components_map(graph, weak_components_map); + // prepare and write snapshots if (snapshot) { for (int j = 0; j < snapshots.size(); j++) { std::string snapshot_file_name = snapshots[j]; diff --git a/src/algorithms/pg_sgd/path_sgd.hpp b/src/algorithms/pg_sgd/path_sgd.hpp index c8cbc25f..e3c91d4e 100644 --- a/src/algorithms/pg_sgd/path_sgd.hpp +++ b/src/algorithms/pg_sgd/path_sgd.hpp @@ -32,7 +32,7 @@ namespace algorithms { using namespace handlegraph; /// use SGD driven, by path guided, and partly zipfian distribution sampled pairwise distances to obtain a 1D linear layout of the graph that respects its topology -std::vector path_linear_sgd(const graph_t &graph, +std::vector path_linear_sgd(graph_t &graph, const xp::XP &path_index, const std::vector& path_sgd_use_paths, const uint64_t &iter_max, @@ -58,7 +58,7 @@ std::vector path_linear_sgd_schedule(const double &w_min, const uint64_t &iter_with_max_learning_rate, const double &eps); -std::vector path_linear_sgd_order(const graph_t &graph, +std::vector path_linear_sgd_order(graph_t &graph, const xp::XP &path_index, const std::vector& path_sgd_use_paths, const uint64_t &iter_max, diff --git a/src/algorithms/pg_sgd/path_sgd_helper.cpp b/src/algorithms/pg_sgd/path_sgd_helper.cpp index 8dc53314..483e4b3f 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.cpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.cpp @@ -116,5 +116,51 @@ namespace odgi { #endif return etas; } + + const void prepare_weak_connected_components_map(graph_t& graph, + std::vector& weak_components_map) { +#ifdef debug_components + std::cerr << "node count: " << graph.get_node_count() << std::endl; +#endif + // refine order by weakly connected components + + // prepare weakly connected components + std::vector> weak_components = algorithms::weakly_connected_components( + &graph); +#ifdef debug_components + std::cerr << "components count: " << weak_components.size() << std::endl; +#endif + std::vector> weak_component_order; + for (int i = 0; i < weak_components.size(); i++) { + auto &weak_component = weak_components[i]; + uint64_t id_sum = 0; + for (auto node_id : weak_component) { + id_sum += node_id; + } + double avg_id = id_sum / (double) weak_component.size(); + weak_component_order.push_back(std::make_pair(avg_id, i)); + } + std::sort(weak_component_order.begin(), weak_component_order.end()); + std::vector weak_component_id; // maps rank to "id" based on the original sorted order + weak_component_id.resize(weak_component_order.size()); + uint64_t component_id = 0; + for (auto &component_order : weak_component_order) { + weak_component_id[component_order.second] = component_id++; + } + weak_components_map.resize(graph.get_node_count()); + // reserve the space we need + for (int i = 0; i < weak_components.size(); i++) { + auto &weak_component = weak_components[i]; + // store for each node identifier to component start index + for (auto node_id : weak_component) { + weak_components_map[node_id - 1] = weak_component_id[i]; + } +#ifdef debug_components + std::cerr << "weak_component.size(): " << weak_component.size() << std::endl; + std::cerr << "component_index: " << i << std::endl; +#endif + } + weak_components_map.clear(); + } } } diff --git a/src/algorithms/pg_sgd/path_sgd_helper.hpp b/src/algorithms/pg_sgd/path_sgd_helper.hpp index 041a055d..5f92c96a 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.hpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.hpp @@ -3,6 +3,7 @@ #include "odgi.hpp" #include "xp.hpp" #include "algorithms/stepindex.hpp" +#include "../weakly_connected_components.hpp" namespace odgi { namespace algorithms { @@ -36,11 +37,15 @@ namespace odgi { std::vector& is_ref, const bool &progress); - /// calculate the schedule + /// calculate the SGD schedule std::vector path_linear_sgd_schedule(const double &w_min, const double &w_max, const uint64_t &iter_max, const uint64_t &iter_with_max_learning_rate, const double &eps); + + /// prepare the weak components map which we will need later so we separate all graph components in 1D + const void prepare_weak_connected_components_map(graph_t& graph, + std::vector& weak_components_map); } } diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 08a925ca..335a3bb6 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -302,14 +302,17 @@ namespace odgi { if (cooling.load() || flip(gen)) { auto _theta = adj_theta.load(); bool path_end = false; - uint64_t local_space; // FIXME we need to limit the jump length by the given path length? // FIXME I think the problem is how the zetas are created?! // FIXME @Andrea we need to fix this somehow - if (path_step_count > space_max){ - local_space = space_max + (path_step_count - space_max) / space_quantization_step + 1; + /* + uint64_t jump_space = std::min(space, s_rank); + uint64_t space = jump_space; + if (jump_space > space_max){ + space = space_max + (jump_space - space_max) / space_quantization_step + 1; } - //dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_step_count, _theta, zetas[local_space]); // FIXME @Andrea + dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, jump_space, _theta, zetas[space]); FIXME @Andrea + */ dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_step_count/100, _theta); // 100 for DRB1-3123 data set dirtyzipf::dirty_zipfian_int_distribution z(z_p); uint64_t z_i = z(gen); @@ -605,50 +608,9 @@ namespace odgi { target_sorting, target_nodes, sum_path_step_count); - // TODO move the following into its own function that we can reuse -#ifdef debug_components - std::cerr << "node count: " << graph.get_node_count() << std::endl; -#endif - // refine order by weakly connected components - - // prepare weakly connected components - std::vector> weak_components = algorithms::weakly_connected_components( - &graph); -#ifdef debug_components - std::cerr << "components count: " << weak_components.size() << std::endl; -#endif - std::vector> weak_component_order; - for (int i = 0; i < weak_components.size(); i++) { - auto &weak_component = weak_components[i]; - uint64_t id_sum = 0; - for (auto node_id : weak_component) { - id_sum += node_id; - } - double avg_id = id_sum / (double) weak_component.size(); - weak_component_order.push_back(std::make_pair(avg_id, i)); - } - std::sort(weak_component_order.begin(), weak_component_order.end()); - std::vector weak_component_id; // maps rank to "id" based on the original sorted order - weak_component_id.resize(weak_component_order.size()); - uint64_t component_id = 0; - for (auto &component_order : weak_component_order) { - weak_component_id[component_order.second] = component_id++; - } + // prepare the weak components map std::vector weak_components_map; - weak_components_map.resize(graph.get_node_count()); - // reserve the space we need - for (int i = 0; i < weak_components.size(); i++) { - auto &weak_component = weak_components[i]; - // store for each node identifier to component start index - for (auto node_id : weak_component) { - weak_components_map[node_id - 1] = weak_component_id[i]; - } -#ifdef debug_components - std::cerr << "weak_component.size(): " << weak_component.size() << std::endl; - std::cerr << "component_index: " << i << std::endl; -#endif - } - weak_components_map.clear(); + algorithms::prepare_weak_connected_components_map(graph, weak_components_map); // generate and write snapshot graphs if (snapshot) { for (int j = 0; j < snapshots.size(); j++) { From 925683a935dd2a9d23e9352925f99a3c3d70682e Mon Sep 17 00:00:00 2001 From: subwaystation Date: Thu, 3 Nov 2022 14:43:04 +0100 Subject: [PATCH 27/29] refactor generate_and_write_snapshot_graphs --- src/algorithms/pg_sgd/path_sgd.cpp | 57 +++-------------------- src/algorithms/pg_sgd/path_sgd_helper.cpp | 54 +++++++++++++++++++++ src/algorithms/pg_sgd/path_sgd_helper.hpp | 8 +++- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 51 ++------------------ src/algorithms/pg_sgd/path_sgd_ssi.hpp | 2 +- 5 files changed, 73 insertions(+), 99 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd.cpp b/src/algorithms/pg_sgd/path_sgd.cpp index 331764a3..2da86cb8 100644 --- a/src/algorithms/pg_sgd/path_sgd.cpp +++ b/src/algorithms/pg_sgd/path_sgd.cpp @@ -520,56 +520,13 @@ namespace odgi { std::vector weak_components_map; algorithms::prepare_weak_connected_components_map(graph, weak_components_map); // prepare and write snapshots - if (snapshot) { - for (int j = 0; j < snapshots.size(); j++) { - std::string snapshot_file_name = snapshots[j]; - std::ifstream snapshot_instream(snapshot_file_name); - std::vector snapshot_layout; - std::string line; - while(std::getline(snapshot_instream, line)) { - snapshot_layout.push_back(std::stod(line)); - } - snapshot_instream.close(); - uint64_t i = 0; - std::vector snapshot_handle_layout; - graph.for_each_handle( - [&i, &snapshot_layout, &weak_components_map, &snapshot_handle_layout]( - const handle_t &handle) { - snapshot_handle_layout.push_back( - { - weak_components_map[number_bool_packing::unpack_number(handle)], - snapshot_layout[i++], - handle - }); - }); - // sort the graph layout by component, then pos, then handle rank - std::sort(snapshot_handle_layout.begin(), snapshot_handle_layout.end(), - [&](const handle_layout_t &a, - const handle_layout_t &b) { - return a.weak_component < b.weak_component - || (a.weak_component == b.weak_component - && a.pos < b.pos - || (a.pos == b.pos - && as_integer(a.handle) < as_integer(b.handle))); - }); - std::vector order; - order.reserve(graph.get_node_count()); - for (auto &layout_handle : snapshot_handle_layout) { - order.push_back(layout_handle.handle); - } - std::cerr << "[odgi::path_linear_sgd] Applying order to graph of snapshot: " << std::to_string(j + 1) - << std::endl; - std::string local_snapshot_prefix = snapshot_prefix + std::to_string(j + 1); - auto* graph_copy = new odgi::graph_t(); - utils::graph_deep_copy(graph, graph_copy); - graph_copy->apply_ordering(order, true); - ofstream f(local_snapshot_prefix); - std::cerr << "[odgi::path_linear_sgd] Writing snapshot: " << std::to_string(j + 1) << std::endl; - graph_copy->serialize(f); - f.close(); - } - } - std::vector handle_layout; + if (snapshot) { + generate_and_write_snapshot_graphs(graph, + weak_components_map, + snapshot_prefix, + snapshots); + } + std::vector handle_layout; uint64_t i = 0; graph.for_each_handle( [&i, &layout, &weak_components_map, &handle_layout](const handle_t &handle) { diff --git a/src/algorithms/pg_sgd/path_sgd_helper.cpp b/src/algorithms/pg_sgd/path_sgd_helper.cpp index 483e4b3f..9e763b72 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.cpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.cpp @@ -162,5 +162,59 @@ namespace odgi { } weak_components_map.clear(); } + + /// prepare the weak components map which we will need later so we separate all graph components in 1D + const void generate_and_write_snapshot_graphs(graph_t& graph, + std::vector& weak_components_map, + const std::string& snapshot_prefix, + std::vector& snapshots) { + for (int j = 0; j < snapshots.size(); j++) { + std::string snapshot_file_name = snapshots[j]; + std::ifstream snapshot_instream(snapshot_file_name); + std::vector snapshot_layout; + std::string line; + while(std::getline(snapshot_instream, line)) { + snapshot_layout.push_back(std::stod(line)); + } + snapshot_instream.close(); + uint64_t i = 0; + std::vector snapshot_handle_layout; + graph.for_each_handle( + [&i, &snapshot_layout, &weak_components_map, &snapshot_handle_layout]( + const handle_t &handle) { + snapshot_handle_layout.push_back( + { + weak_components_map[number_bool_packing::unpack_number(handle)], + snapshot_layout[i++], + handle + }); + }); + // sort the graph layout by component, then pos, then handle rank + std::sort(snapshot_handle_layout.begin(), snapshot_handle_layout.end(), + [&](const algorithms::handle_layout_t &a, + const algorithms::handle_layout_t &b) { + return a.weak_component < b.weak_component + || (a.weak_component == b.weak_component + && a.pos < b.pos + || (a.pos == b.pos + && as_integer(a.handle) < as_integer(b.handle))); + }); + std::vector order; + order.reserve(graph.get_node_count()); + for (auto &layout_handle : snapshot_handle_layout) { + order.push_back(layout_handle.handle); + } + std::cerr << "[odgi::path_linear_sgd] Applying order to graph of snapshot: " << std::to_string(j + 1) + << std::endl; + std::string local_snapshot_prefix = snapshot_prefix + std::to_string(j + 1); + auto* graph_copy = new odgi::graph_t(); + utils::graph_deep_copy(graph, graph_copy); + graph_copy->apply_ordering(order, true); + ofstream f(local_snapshot_prefix); + std::cerr << "[odgi::path_linear_sgd] Writing snapshot: " << std::to_string(j + 1) << std::endl; + graph_copy->serialize(f); + f.close(); + } + } } } diff --git a/src/algorithms/pg_sgd/path_sgd_helper.hpp b/src/algorithms/pg_sgd/path_sgd_helper.hpp index 5f92c96a..20615814 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.hpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.hpp @@ -46,6 +46,12 @@ namespace odgi { /// prepare the weak components map which we will need later so we separate all graph components in 1D const void prepare_weak_connected_components_map(graph_t& graph, - std::vector& weak_components_map); + std::vector& weak_components_map); + + /// prepare the weak components map which we will need later so we separate all graph components in 1D + const void generate_and_write_snapshot_graphs(graph_t& graph, + std::vector& weak_components_map, + const std::string& snapshot_prefix, + std::vector& snapshots); } } diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 335a3bb6..17a8df27 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -613,53 +613,10 @@ namespace odgi { algorithms::prepare_weak_connected_components_map(graph, weak_components_map); // generate and write snapshot graphs if (snapshot) { - for (int j = 0; j < snapshots.size(); j++) { - std::string snapshot_file_name = snapshots[j]; - std::ifstream snapshot_instream(snapshot_file_name); - std::vector snapshot_layout; - std::string line; - while(std::getline(snapshot_instream, line)) { - snapshot_layout.push_back(std::stod(line)); - } - snapshot_instream.close(); - uint64_t i = 0; - std::vector snapshot_handle_layout; - graph.for_each_handle( - [&i, &snapshot_layout, &weak_components_map, &snapshot_handle_layout]( - const handle_t &handle) { - snapshot_handle_layout.push_back( - { - weak_components_map[number_bool_packing::unpack_number(handle)], - snapshot_layout[i++], - handle - }); - }); - // sort the graph layout by component, then pos, then handle rank - std::sort(snapshot_handle_layout.begin(), snapshot_handle_layout.end(), - [&](const algorithms::handle_layout_t &a, - const algorithms::handle_layout_t &b) { - return a.weak_component < b.weak_component - || (a.weak_component == b.weak_component - && a.pos < b.pos - || (a.pos == b.pos - && as_integer(a.handle) < as_integer(b.handle))); - }); - std::vector order; - order.reserve(graph.get_node_count()); - for (auto &layout_handle : snapshot_handle_layout) { - order.push_back(layout_handle.handle); - } - std::cerr << "[odgi::path_linear_sgd] Applying order to graph of snapshot: " << std::to_string(j + 1) - << std::endl; - std::string local_snapshot_prefix = snapshot_prefix + std::to_string(j + 1); - auto* graph_copy = new odgi::graph_t(); - utils::graph_deep_copy(graph, graph_copy); - graph_copy->apply_ordering(order, true); - ofstream f(local_snapshot_prefix); - std::cerr << "[odgi::path_linear_sgd] Writing snapshot: " << std::to_string(j + 1) << std::endl; - graph_copy->serialize(f); - f.close(); - } + generate_and_write_snapshot_graphs(graph, + weak_components_map, + snapshot_prefix, + snapshots); } // from layout to order std::vector handle_layout; diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.hpp b/src/algorithms/pg_sgd/path_sgd_ssi.hpp index 6f9d7e1d..94f94be0 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.hpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.hpp @@ -85,4 +85,4 @@ namespace odgi { } -} \ No newline at end of file +} From 2ae8a1928966dbddb65b25176385cdbe14a6e3d5 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Thu, 3 Nov 2022 14:57:49 +0100 Subject: [PATCH 28/29] refactor from_layout_to_node_order --- src/algorithms/pg_sgd/path_sgd.cpp | 34 +++++------------------ src/algorithms/pg_sgd/path_sgd_helper.cpp | 31 +++++++++++++++++++++ src/algorithms/pg_sgd/path_sgd_helper.hpp | 6 ++++ src/algorithms/pg_sgd/path_sgd_ssi.cpp | 29 +++---------------- src/subcommand/sort_main.cpp | 9 ++++-- 5 files changed, 55 insertions(+), 54 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd.cpp b/src/algorithms/pg_sgd/path_sgd.cpp index 2da86cb8..384466d6 100644 --- a/src/algorithms/pg_sgd/path_sgd.cpp +++ b/src/algorithms/pg_sgd/path_sgd.cpp @@ -526,33 +526,13 @@ namespace odgi { snapshot_prefix, snapshots); } - std::vector handle_layout; - uint64_t i = 0; - graph.for_each_handle( - [&i, &layout, &weak_components_map, &handle_layout](const handle_t &handle) { - handle_layout.push_back( - { - weak_components_map[number_bool_packing::unpack_number(handle)], - layout[i++], - handle - }); - }); - // sort the graph layout by component, then pos, then handle rank - std::sort(handle_layout.begin(), handle_layout.end(), - [&](const handle_layout_t &a, - const handle_layout_t &b) { - return a.weak_component < b.weak_component - || (a.weak_component == b.weak_component - && a.pos < b.pos - || (a.pos == b.pos - && as_integer(a.handle) < as_integer(b.handle))); - }); - std::vector order; - order.reserve(graph.get_node_count()); - for (auto &layout_handle : handle_layout) { - order.push_back(layout_handle.handle); - } - return order; + // from layout to order + std::vector order; + algorithms::from_layout_to_node_order(graph, + weak_components_map, + order, + layout); + return order; } } } diff --git a/src/algorithms/pg_sgd/path_sgd_helper.cpp b/src/algorithms/pg_sgd/path_sgd_helper.cpp index 9e763b72..0a3a4ad5 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.cpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.cpp @@ -216,5 +216,36 @@ namespace odgi { f.close(); } } + + const void from_layout_to_node_order(graph_t& graph, + std::vector& weak_components_map, + std::vector& order, + std::vector& layout) { + std::vector handle_layout; + uint64_t i = 0; + graph.for_each_handle( + [&i, &layout, &weak_components_map, &handle_layout](const handle_t &handle) { + handle_layout.push_back( + { + weak_components_map[number_bool_packing::unpack_number(handle)], + layout[i++], + handle + }); + }); + // sort the graph layout by component, then pos, then handle rank + std::sort(handle_layout.begin(), handle_layout.end(), + [&](const algorithms::handle_layout_t &a, + const algorithms::handle_layout_t &b) { + return a.weak_component < b.weak_component + || (a.weak_component == b.weak_component + && a.pos < b.pos + || (a.pos == b.pos + && as_integer(a.handle) < as_integer(b.handle))); + }); + order.reserve(graph.get_node_count()); + for (auto &layout_handle : handle_layout) { + order.push_back(layout_handle.handle); + } + } } } diff --git a/src/algorithms/pg_sgd/path_sgd_helper.hpp b/src/algorithms/pg_sgd/path_sgd_helper.hpp index 20615814..cc995eb9 100644 --- a/src/algorithms/pg_sgd/path_sgd_helper.hpp +++ b/src/algorithms/pg_sgd/path_sgd_helper.hpp @@ -53,5 +53,11 @@ namespace odgi { std::vector& weak_components_map, const std::string& snapshot_prefix, std::vector& snapshots); + + /// translate the 1D layout into an actual node order + const void from_layout_to_node_order(graph_t& graph, + std::vector& weak_components_map, + std::vector& order, + std::vector& layout); } } diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 17a8df27..618a8a08 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -619,32 +619,11 @@ namespace odgi { snapshots); } // from layout to order - std::vector handle_layout; - uint64_t i = 0; - graph.for_each_handle( - [&i, &layout, &weak_components_map, &handle_layout](const handle_t &handle) { - handle_layout.push_back( - { - weak_components_map[number_bool_packing::unpack_number(handle)], - layout[i++], - handle - }); - }); - // sort the graph layout by component, then pos, then handle rank - std::sort(handle_layout.begin(), handle_layout.end(), - [&](const algorithms::handle_layout_t &a, - const algorithms::handle_layout_t &b) { - return a.weak_component < b.weak_component - || (a.weak_component == b.weak_component - && a.pos < b.pos - || (a.pos == b.pos - && as_integer(a.handle) < as_integer(b.handle))); - }); std::vector order; - order.reserve(graph.get_node_count()); - for (auto &layout_handle : handle_layout) { - order.push_back(layout_handle.handle); - } + algorithms::from_layout_to_node_order(graph, + weak_components_map, + order, + layout); return order; } } diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index 32ebb39d..747acbdf 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -41,6 +41,7 @@ int main_sort(int argc, char** argv) { " ending with *.og* is recommended.", {'o', "out"}); args::Group files_io_opts(parser, "[ Files IO Options ]"); args::ValueFlag xp_in_file(files_io_opts, "FILE", "Load the succinct variation graph index from this *FILE*. The file name usually ends with *.xp*.", {'X', "path-index"}); + /// @Andrea new argument args::ValueFlag ssi_in_file(files_io_opts, "FILE", "Load the sampled step index from this *FILE*. The file name usually ends with *.ssi*.", {'e', "sampled-step-index"}); args::ValueFlag sort_order_in(files_io_opts, "FILE", "*FILE* containing the sort order. Each line contains one node identifer.", {'s', "sort-order"}); args::ValueFlag tmp_base(files_io_opts, "PATH", "directory for temporary files", {'C', "temp-dir"}); @@ -63,6 +64,7 @@ int main_sort(int argc, char** argv) { /// path guided linear 1D SGD args::Group pg_sgd_opts(parser, "[ Path Guided 1D SGD Sort ]"); args::Flag p_sgd(pg_sgd_opts, "path-sgd", "Apply the path-guided linear 1D SGD algorithm to organize graph.", {'Y', "path-sgd"}); + /// @Andrea new argument args::Flag p_sgd_xp(pg_sgd_opts, "path-sgd", "Use the faster but memory expensive XP index.", {'m', "path-sgd-path-index"}); args::ValueFlag p_sgd_in_file(pg_sgd_opts, "FILE", "Specify a line separated list of paths to sample from for the on the" " fly term generation process in the path guided linear 1D SGD (default: sample from all paths).", {'f', "path-sgd-use-paths"}); @@ -269,6 +271,7 @@ int main_sort(int argc, char** argv) { } std::vector is_ref; std::vector target_paths; + /// @Andrea uint64_t ssi_sample_rate = 8; uint64_t sum_path_step_count; if (p_sgd || args::get(pipeline).find('Y') != std::string::npos) { @@ -277,6 +280,7 @@ int main_sort(int argc, char** argv) { algorithms::sort_graph_by_target_paths(graph, target_paths, is_ref, args::get(progress)); } // SSI should be the new default, else we take the XP we received from the input or if the input is an empty string we generate a new one anyhow + /// @Andrea if (xp_in_file) { std::ifstream in; in.open(args::get(xp_in_file)); @@ -287,7 +291,7 @@ int main_sort(int argc, char** argv) { path_index.from_handle_graph(graph, num_threads); // do we load the SSI from file? } else if (ssi_in_file) { - // FIXME we could also only allow an integer as input to build an ssi for the given sample rate + // FIXME we could also only allow an integer as input to build an ssi for the given sample rate? @Andrea sampled_step_index.load(args::get(ssi_in_file)); // store the SSI sampling rate ssi_sample_rate = sampled_step_index.get_sample_rate(); @@ -297,7 +301,7 @@ int main_sort(int argc, char** argv) { sampled_step_index.from_handle_graph(graph, path_sgd_use_paths, num_threads, args::get(progress), ssi_sample_rate); } fresh_step_index = true; - + /// @Andrea sum_path_step_count = algorithms::get_sum_path_step_count(path_sgd_use_paths, graph); if (args::get(p_sgd_min_term_updates_paths)) { path_sgd_min_term_updates = args::get(p_sgd_min_term_updates_paths) * sum_path_step_count; @@ -308,6 +312,7 @@ int main_sort(int argc, char** argv) { path_sgd_min_term_updates = 1.0 * sum_path_step_count; } } + /// @Andrea uint64_t max_path_step_count = algorithms::get_max_path_step_count(path_sgd_use_paths, graph); // we might have to use the SSI here if (xp_way) { From b5afae3f25e3720fb93c7b942c281e869ec97da7 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 7 Nov 2022 11:03:46 +0100 Subject: [PATCH 29/29] take a look in the mirror --- src/algorithms/pg_sgd/path_sgd_ssi.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/algorithms/pg_sgd/path_sgd_ssi.cpp b/src/algorithms/pg_sgd/path_sgd_ssi.cpp index 618a8a08..b147f2e7 100644 --- a/src/algorithms/pg_sgd/path_sgd_ssi.cpp +++ b/src/algorithms/pg_sgd/path_sgd_ssi.cpp @@ -305,15 +305,14 @@ namespace odgi { // FIXME we need to limit the jump length by the given path length? // FIXME I think the problem is how the zetas are created?! // FIXME @Andrea we need to fix this somehow - /* - uint64_t jump_space = std::min(space, s_rank); + + uint64_t jump_space = std::min(space, path_step_count); uint64_t space = jump_space; if (jump_space > space_max){ space = space_max + (jump_space - space_max) / space_quantization_step + 1; } - dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, jump_space, _theta, zetas[space]); FIXME @Andrea - */ - dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_step_count/100, _theta); // 100 for DRB1-3123 data set + dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, jump_space, _theta, zetas[space]); // FIXME @Andrea + // dirtyzipf::dirty_zipfian_int_distribution::param_type z_p(1, path_step_count/100, _theta); // 100 for DRB1-3123 data set dirtyzipf::dirty_zipfian_int_distribution z(z_p); uint64_t z_i = z(gen);