Skip to content

Commit

Permalink
feat: Add node mask parameter to odgi similarity for selective node p…
Browse files Browse the repository at this point in the history
…rocessing
  • Loading branch information
AndreaGuarracino committed Nov 29, 2024
1 parent f4c1671 commit f81edd5
Showing 1 changed file with 29 additions and 0 deletions.
29 changes: 29 additions & 0 deletions src/subcommand/similarity_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ int main_similarity(int argc, char** argv) {
args::ArgumentParser parser("Provides a sparse similarity matrix for paths or groups of paths. Each line prints in a tab-delimited format to stdout.");
args::Group mandatory_opts(parser, "[ MANDATORY ARGUMENTS ]");
args::ValueFlag<std::string> dg_in_file(mandatory_opts, "FILE", "Load the succinct variation graph in ODGI format from this *FILE*. The file name usually ends with *.og*. It also accepts GFAv1, but the on-the-fly conversion to the ODGI format requires additional time!", {'i', "idx"});
args::ValueFlag<std::string> mask_file(mandatory_opts, "FILE", "Load node mask from this file. Each line contains a 0 or 1, where 0 means the node at that position should be ignored in similarity computation.", {'m', "mask"});
args::Group path_investigation_opts(parser, "[ Path Investigation Options ]");
args::ValueFlag<std::string> path_delim(path_investigation_opts, "CHAR", "The part of each path name before this delimiter CHAR is a group identifier.",
{'D', "delim"});
Expand Down Expand Up @@ -198,9 +199,37 @@ args::Group threading_opts(parser, "[ Threading ]");
}

// ska::flat_hash_map<std::pair<uint64_t, uint64_t>, uint64_t> leads to huge memory usage with deep graphs
// Load mask if specified
std::vector<bool> node_mask;
if (mask_file) {
std::ifstream mask_in(args::get(mask_file));
std::string line;
while (std::getline(mask_in, line)) {
if (line == "0") {
node_mask.push_back(false);
} else if (line == "1") {
node_mask.push_back(true);
} else {
std::cerr << "[odgi::similarity] error: mask file should contain only 0 or 1 values, found: " << line << std::endl;
return 1;
}
}
if (node_mask.size() != graph.get_node_count()) {
std::cerr << "[odgi::similarity] error: mask file should have exactly " << graph.get_node_count() << " lines, found: " << node_mask.size() << std::endl;
return 1;
}
} else {
// If no mask specified, include all nodes
node_mask.resize(graph.get_node_count(), true);
}

ska::flat_hash_map<uint64_t, uint64_t> path_intersection_length;
graph.for_each_handle(
[&](const handle_t& h) {
// Skip masked-out nodes
if (!node_mask[graph.get_id(h) - 1]) {
return;
}
ska::flat_hash_map<uint32_t, uint64_t> local_path_lengths;
size_t l = graph.get_length(h);
graph.for_each_step_on_handle(
Expand Down

0 comments on commit f81edd5

Please sign in to comment.