From f81edd575f9c96a48027f6e53009978ce9ccd6ee Mon Sep 17 00:00:00 2001 From: "AndreaGuarracino (aider)" Date: Thu, 28 Nov 2024 21:22:18 -0600 Subject: [PATCH] feat: Add node mask parameter to odgi similarity for selective node processing --- src/subcommand/similarity_main.cpp | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/subcommand/similarity_main.cpp b/src/subcommand/similarity_main.cpp index da913e3c..ae0fb510 100644 --- a/src/subcommand/similarity_main.cpp +++ b/src/subcommand/similarity_main.cpp @@ -31,6 +31,7 @@ int main_similarity(int argc, char** argv) { args::ArgumentParser parser("Provides a sparse similarity matrix for paths or groups of paths. Each line prints in a tab-delimited format to stdout."); args::Group mandatory_opts(parser, "[ MANDATORY ARGUMENTS ]"); args::ValueFlag dg_in_file(mandatory_opts, "FILE", "Load the succinct variation graph in ODGI format from this *FILE*. The file name usually ends with *.og*. It also accepts GFAv1, but the on-the-fly conversion to the ODGI format requires additional time!", {'i', "idx"}); + args::ValueFlag mask_file(mandatory_opts, "FILE", "Load node mask from this file. Each line contains a 0 or 1, where 0 means the node at that position should be ignored in similarity computation.", {'m', "mask"}); args::Group path_investigation_opts(parser, "[ Path Investigation Options ]"); args::ValueFlag path_delim(path_investigation_opts, "CHAR", "The part of each path name before this delimiter CHAR is a group identifier.", {'D', "delim"}); @@ -198,9 +199,37 @@ args::Group threading_opts(parser, "[ Threading ]"); } // ska::flat_hash_map, uint64_t> leads to huge memory usage with deep graphs + // Load mask if specified + std::vector node_mask; + if (mask_file) { + std::ifstream mask_in(args::get(mask_file)); + std::string line; + while (std::getline(mask_in, line)) { + if (line == "0") { + node_mask.push_back(false); + } else if (line == "1") { + node_mask.push_back(true); + } else { + std::cerr << "[odgi::similarity] error: mask file should contain only 0 or 1 values, found: " << line << std::endl; + return 1; + } + } + if (node_mask.size() != graph.get_node_count()) { + std::cerr << "[odgi::similarity] error: mask file should have exactly " << graph.get_node_count() << " lines, found: " << node_mask.size() << std::endl; + return 1; + } + } else { + // If no mask specified, include all nodes + node_mask.resize(graph.get_node_count(), true); + } + ska::flat_hash_map path_intersection_length; graph.for_each_handle( [&](const handle_t& h) { + // Skip masked-out nodes + if (!node_mask[graph.get_id(h) - 1]) { + return; + } ska::flat_hash_map local_path_lengths; size_t l = graph.get_length(h); graph.for_each_step_on_handle(