From 5727f3e79a3e80ce185b3b1dd1662e474c0c93cc Mon Sep 17 00:00:00 2001 From: chris Mitchell Date: Tue, 29 Oct 2024 15:53:26 -0400 Subject: [PATCH 1/2] Add convenience methods for graph interface --- src/exports/gfa.rs | 26 ++++++++-------- src/graph.rs | 63 +++++++++++++++++++++++++++++++++++++++ src/models/block_group.rs | 16 ++++++++-- src/models/edge.rs | 37 ++++++++++++++++++++--- src/test_helpers.rs | 27 +++++++++++++++-- 5 files changed, 148 insertions(+), 21 deletions(-) diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 923a78e..b830dd4 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -1,11 +1,4 @@ -use itertools::Itertools; -use petgraph::prelude::DiGraphMap; -use rusqlite::{types::Value as SQLValue, Connection}; -use std::collections::{HashMap, HashSet}; -use std::fs::File; -use std::io::{BufWriter, Write}; -use std::path::PathBuf; - +use crate::graph::{GraphEdge, GraphNode}; use crate::models::{ block_group::BlockGroup, block_group_edge::BlockGroupEdge, @@ -16,6 +9,13 @@ use crate::models::{ path_edge::PathEdge, strand::Strand, }; +use itertools::Itertools; +use petgraph::prelude::DiGraphMap; +use rusqlite::{types::Value as SQLValue, Connection}; +use std::collections::{HashMap, HashSet}; +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::PathBuf; pub fn export_gfa( conn: &Connection, @@ -104,12 +104,14 @@ fn segment_line(sequence: &str, node_id: i64, sequence_start: i64) -> String { fn write_links( writer: &mut BufWriter, - graph: &DiGraphMap, + graph: &DiGraphMap, edges_by_node_pair: &HashMap<(i64, i64), Edge>, node_sequence_starts_by_end_coordinate: HashMap<(i64, i64), i64>, ) { - for (source, target, ()) in graph.all_edges() { - let edge = edges_by_node_pair.get(&(source, target)).unwrap(); + for (source, target, _edge_weight) in graph.all_edges() { + let edge = edges_by_node_pair + .get(&(source.block_id, target.block_id)) + .unwrap(); if Node::is_terminal(edge.source_node_id) || Node::is_terminal(edge.target_node_id) { continue; } @@ -134,7 +136,7 @@ fn write_links( ) .unwrap_or_else(|_| { panic!( - "Error writing link from segment {} to {} to GFA stream", + "Error writing link from segment {:?} to {:?} to GFA stream", source, target, ) }); diff --git a/src/graph.rs b/src/graph.rs index 4afbf72..35c7b40 100644 --- a/src/graph.rs +++ b/src/graph.rs @@ -4,6 +4,27 @@ use std::iter::from_fn; use petgraph::visit::{IntoNeighborsDirected, NodeCount}; use petgraph::Direction; +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)] +pub struct GraphNode { + pub block_id: i64, + pub node_id: i64, + pub sequence_start: i64, + pub sequence_end: i64, +} + +impl GraphNode { + pub fn length(&self) -> i64 { + self.sequence_end - self.sequence_start + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)] +pub struct GraphEdge { + pub edge_id: i64, + pub chromosome_index: i64, + pub phased: i64, +} + // hacked from https://docs.rs/petgraph/latest/src/petgraph/algo/simple_paths.rs.html#36-102 to support digraphmap pub fn all_simple_paths( @@ -161,4 +182,46 @@ mod tests { HashSet::from_iter(expected_paths) ); } + + #[test] + fn test_super_bubble_path() { + // This graph looks like this: + // 8 + // / \ + // 6 -> 7 + // / \ + // 1 -> 2 -> 3 -> 4 -> 5 + // + // We ensure that we capture all 3 paths from 1 -> 5 + let mut graph: DiGraphMap = DiGraphMap::new(); + graph.add_node(1); + graph.add_node(2); + graph.add_node(3); + graph.add_node(4); + graph.add_node(5); + graph.add_node(6); + graph.add_node(7); + graph.add_node(8); + graph.add_node(9); + + graph.add_edge(1, 2, ()); + graph.add_edge(2, 3, ()); + graph.add_edge(3, 4, ()); + graph.add_edge(4, 5, ()); + graph.add_edge(2, 6, ()); + graph.add_edge(6, 7, ()); + graph.add_edge(7, 4, ()); + graph.add_edge(6, 8, ()); + graph.add_edge(8, 7, ()); + + let paths = all_simple_paths(&graph, 1, 5).collect::>>(); + assert_eq!( + HashSet::>::from_iter::>>(paths), + HashSet::from_iter(vec![ + vec![1, 2, 3, 4, 5], + vec![1, 2, 6, 7, 4, 5], + vec![1, 2, 6, 8, 7, 4, 5] + ]) + ); + } } diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 3406059..f4cd93d 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -2,11 +2,12 @@ use std::collections::{HashMap, HashSet}; use intervaltree::IntervalTree; use itertools::Itertools; +use petgraph::graphmap::DiGraphMap; use petgraph::Direction; use rusqlite::{params_from_iter, types::Value as SQLValue, Connection}; use serde::{Deserialize, Serialize}; -use crate::graph::all_simple_paths; +use crate::graph::{all_simple_paths, GraphEdge, GraphNode}; use crate::models::accession::{Accession, AccessionEdge, AccessionEdgeData, AccessionPath}; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::{Edge, EdgeData, GroupBlock}; @@ -308,6 +309,15 @@ impl BlockGroup { } } + pub fn get_graph(conn: &Connection, block_group_id: i64) -> DiGraphMap { + let mut edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id); + let blocks = Edge::blocks_from_edges(conn, &edges); + let boundary_edges = Edge::boundary_edges_from_sequences(&blocks); + edges.extend(boundary_edges.clone()); + let (graph, _) = Edge::build_graph(&edges, &blocks); + graph + } + pub fn get_all_sequences(conn: &Connection, block_group_id: i64) -> HashSet { let mut edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id); let blocks = Edge::blocks_from_edges(conn, &edges); @@ -339,7 +349,7 @@ impl BlockGroup { for end_node in &end_nodes { // TODO: maybe make all_simple_paths return a single path id where start == end if start_node == *end_node { - let block = blocks_by_id.get(&start_node).unwrap(); + let block = blocks_by_id.get(&start_node.block_id).unwrap(); if block.node_id != PATH_START_NODE_ID && block.node_id != PATH_END_NODE_ID { sequences.insert(block.sequence()); } @@ -347,7 +357,7 @@ impl BlockGroup { for path in all_simple_paths(&graph, start_node, *end_node) { let mut current_sequence = "".to_string(); for node in path { - let block = blocks_by_id.get(&node).unwrap(); + let block = blocks_by_id.get(&node.block_id).unwrap(); let block_sequence = block.sequence(); current_sequence.push_str(&block_sequence); } diff --git a/src/models/edge.rs b/src/models/edge.rs index 4e8414c..78e0f56 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -6,6 +6,7 @@ use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::hash::{Hash, RandomState}; +use crate::graph::{GraphEdge, GraphNode}; use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}; use crate::models::sequence::{cached_sequence, Sequence}; use crate::models::strand::Strand; @@ -412,7 +413,7 @@ impl Edge { pub fn build_graph( edges: &Vec, blocks: &Vec, - ) -> (DiGraphMap, HashMap<(i64, i64), Edge>) { + ) -> (DiGraphMap, HashMap<(i64, i64), Edge>) { let blocks_by_start = blocks .clone() .into_iter() @@ -439,11 +440,21 @@ impl Edge { ) }) .collect::>(); + let block_coordinates = blocks + .clone() + .into_iter() + .map(|block| (block.id, (block.start, block.end))) + .collect::>(); - let mut graph: DiGraphMap = DiGraphMap::new(); + let mut graph: DiGraphMap = DiGraphMap::new(); let mut edges_by_node_pair = HashMap::new(); for block in blocks { - graph.add_node(block.id); + graph.add_node(GraphNode { + block_id: block.id, + node_id: block.node_id, + sequence_start: block.start, + sequence_end: block.end, + }); } for edge in edges { let source_key = BlockKey { @@ -459,7 +470,25 @@ impl Edge { if let Some(source_id_value) = source_id { if let Some(target_id_value) = target_id { - graph.add_edge(*source_id_value, *target_id_value, ()); + graph.add_edge( + GraphNode { + block_id: *source_id_value, + node_id: edge.source_node_id, + sequence_start: block_coordinates[source_id_value].0, + sequence_end: block_coordinates[source_id_value].1, + }, + GraphNode { + block_id: *target_id_value, + node_id: edge.target_node_id, + sequence_start: block_coordinates[target_id_value].0, + sequence_end: block_coordinates[target_id_value].1, + }, + GraphEdge { + chromosome_index: edge.chromosome_index, + phased: edge.phased, + edge_id: edge.id, + }, + ); edges_by_node_pair.insert((*source_id_value, *target_id_value), edge.clone()); } } diff --git a/src/test_helpers.rs b/src/test_helpers.rs index a5027c1..61d6a22 100644 --- a/src/test_helpers.rs +++ b/src/test_helpers.rs @@ -1,9 +1,11 @@ -use std::fs; - +use petgraph::graphmap::DiGraphMap; use rusqlite::Connection; +use std::fs; +use std::io::Write; use tempdir::TempDir; use crate::config::{get_or_create_gen_dir, BASE_DIR}; +use crate::graph::{GraphEdge, GraphNode}; use crate::migrations::{run_migrations, run_operation_migrations}; use crate::models::block_group::BlockGroup; use crate::models::block_group_edge::BlockGroupEdge; @@ -149,3 +151,24 @@ pub fn setup_block_group(conn: &Connection) -> (i64, Path) { ); (block_group.id, path) } + +pub fn save_graph(graph: &DiGraphMap, path: &str) { + use petgraph::dot::{Config, Dot}; + use std::fs::File; + let mut file = File::create(path).unwrap(); + file.write_all( + format!( + "{dot:?}", + dot = Dot::with_attr_getters( + &graph, + &[Config::NodeNoLabel, Config::EdgeNoLabel], + &|_, (_, _, edge_weight)| format!("label = \"{}\"", edge_weight.chromosome_index), + &|_, (node, weight)| format!( + "label = \"{}[{}-{}]\"", + node.node_id, node.sequence_start, node.sequence_end + ), + ) + ) + .as_bytes(), + ); +} From 065e00079e4dca73b186078ae9e4c21b707f2799 Mon Sep 17 00:00:00 2001 From: chris Mitchell Date: Tue, 29 Oct 2024 16:09:57 -0400 Subject: [PATCH 2/2] Simplify write_links --- src/exports/gfa.rs | 45 ++++++++++---------------------------------- src/graph.rs | 3 +++ src/models/edge.rs | 2 ++ src/models/strand.rs | 2 +- 4 files changed, 16 insertions(+), 36 deletions(-) diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index b830dd4..895b334 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -66,18 +66,8 @@ pub fn export_gfa( let file = File::create(filename).unwrap(); let mut writer = BufWriter::new(file); - let node_sequence_starts_by_end_coordinate = blocks - .iter() - .filter(|block| !Node::is_terminal(block.node_id)) - .map(|block| ((block.node_id, block.end), block.start)) - .collect::>(); write_segments(&mut writer, &blocks); - write_links( - &mut writer, - &graph, - &edges_by_node_pair, - node_sequence_starts_by_end_coordinate, - ); + write_links(&mut writer, &graph); write_paths(&mut writer, conn, collection_name, &blocks); } @@ -102,35 +92,20 @@ fn segment_line(sequence: &str, node_id: i64, sequence_start: i64) -> String { format!("S\t{}.{}\t{}\t*\n", node_id, sequence_start, sequence,) } -fn write_links( - writer: &mut BufWriter, - graph: &DiGraphMap, - edges_by_node_pair: &HashMap<(i64, i64), Edge>, - node_sequence_starts_by_end_coordinate: HashMap<(i64, i64), i64>, -) { - for (source, target, _edge_weight) in graph.all_edges() { - let edge = edges_by_node_pair - .get(&(source.block_id, target.block_id)) - .unwrap(); - if Node::is_terminal(edge.source_node_id) || Node::is_terminal(edge.target_node_id) { +fn write_links(writer: &mut BufWriter, graph: &DiGraphMap) { + for (source, target, edge_info) in graph.all_edges() { + if Node::is_terminal(source.node_id) || Node::is_terminal(target.node_id) { continue; } - // Since we're encoding a segment ID as node ID + sequence start coordinate, we need to do - // one step of translation to get that for an edge's source. The edge's source is the node - // ID + sequence end coordinate, so the following line converts that to the sequence start - // coordinate. - let sequence_start = node_sequence_starts_by_end_coordinate - .get(&(edge.source_node_id, edge.source_coordinate)) - .unwrap(); writer .write_all( &link_line( - edge.source_node_id, - *sequence_start, - edge.source_strand, - edge.target_node_id, - edge.target_coordinate, - edge.target_strand, + source.node_id, + source.sequence_start, + edge_info.source_strand, + target.node_id, + target.sequence_start, + edge_info.target_strand, ) .into_bytes(), ) diff --git a/src/graph.rs b/src/graph.rs index 35c7b40..6e44c94 100644 --- a/src/graph.rs +++ b/src/graph.rs @@ -1,6 +1,7 @@ use std::hash::Hash; use std::iter::from_fn; +use crate::models::strand::Strand; use petgraph::visit::{IntoNeighborsDirected, NodeCount}; use petgraph::Direction; @@ -23,6 +24,8 @@ pub struct GraphEdge { pub edge_id: i64, pub chromosome_index: i64, pub phased: i64, + pub source_strand: Strand, + pub target_strand: Strand, } // hacked from https://docs.rs/petgraph/latest/src/petgraph/algo/simple_paths.rs.html#36-102 to support digraphmap diff --git a/src/models/edge.rs b/src/models/edge.rs index 78e0f56..20152ce 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -487,6 +487,8 @@ impl Edge { chromosome_index: edge.chromosome_index, phased: edge.phased, edge_id: edge.id, + source_strand: edge.source_strand, + target_strand: edge.target_strand, }, ); edges_by_node_pair.insert((*source_id_value, *target_id_value), edge.clone()); diff --git a/src/models/strand.rs b/src/models/strand.rs index a2d3697..6fdd276 100644 --- a/src/models/strand.rs +++ b/src/models/strand.rs @@ -3,7 +3,7 @@ use rusqlite::ToSql; use serde::{Deserialize, Serialize}; use std::fmt; -#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize, Ord, PartialOrd)] pub enum Strand { Forward, Reverse,