Skip to content

Commit

Permalink
Merge pull request #80 from ginkgobioworks/graph-convenience
Browse files Browse the repository at this point in the history
Add convenience methods for graph interface
  • Loading branch information
Chris7 authored Oct 30, 2024
2 parents c137133 + 065e000 commit 0b41e0e
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 52 deletions.
61 changes: 19 additions & 42 deletions src/exports/gfa.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,4 @@
use itertools::Itertools;
use petgraph::prelude::DiGraphMap;
use rusqlite::{types::Value as SQLValue, Connection};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::PathBuf;

use crate::graph::{GraphEdge, GraphNode};
use crate::models::{
block_group::BlockGroup,
block_group_edge::BlockGroupEdge,
Expand All @@ -16,6 +9,13 @@ use crate::models::{
path_edge::PathEdge,
strand::Strand,
};
use itertools::Itertools;
use petgraph::prelude::DiGraphMap;
use rusqlite::{types::Value as SQLValue, Connection};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::PathBuf;

pub fn export_gfa(
conn: &Connection,
Expand Down Expand Up @@ -66,18 +66,8 @@ pub fn export_gfa(
let file = File::create(filename).unwrap();
let mut writer = BufWriter::new(file);

let node_sequence_starts_by_end_coordinate = blocks
.iter()
.filter(|block| !Node::is_terminal(block.node_id))
.map(|block| ((block.node_id, block.end), block.start))
.collect::<HashMap<(i64, i64), i64>>();
write_segments(&mut writer, &blocks);
write_links(
&mut writer,
&graph,
&edges_by_node_pair,
node_sequence_starts_by_end_coordinate,
);
write_links(&mut writer, &graph);
write_paths(&mut writer, conn, collection_name, &blocks);
}

Expand All @@ -102,39 +92,26 @@ fn segment_line(sequence: &str, node_id: i64, sequence_start: i64) -> String {
format!("S\t{}.{}\t{}\t*\n", node_id, sequence_start, sequence,)
}

fn write_links(
writer: &mut BufWriter<File>,
graph: &DiGraphMap<i64, ()>,
edges_by_node_pair: &HashMap<(i64, i64), Edge>,
node_sequence_starts_by_end_coordinate: HashMap<(i64, i64), i64>,
) {
for (source, target, ()) in graph.all_edges() {
let edge = edges_by_node_pair.get(&(source, target)).unwrap();
if Node::is_terminal(edge.source_node_id) || Node::is_terminal(edge.target_node_id) {
fn write_links(writer: &mut BufWriter<File>, graph: &DiGraphMap<GraphNode, GraphEdge>) {
for (source, target, edge_info) in graph.all_edges() {
if Node::is_terminal(source.node_id) || Node::is_terminal(target.node_id) {
continue;
}
// Since we're encoding a segment ID as node ID + sequence start coordinate, we need to do
// one step of translation to get that for an edge's source. The edge's source is the node
// ID + sequence end coordinate, so the following line converts that to the sequence start
// coordinate.
let sequence_start = node_sequence_starts_by_end_coordinate
.get(&(edge.source_node_id, edge.source_coordinate))
.unwrap();
writer
.write_all(
&link_line(
edge.source_node_id,
*sequence_start,
edge.source_strand,
edge.target_node_id,
edge.target_coordinate,
edge.target_strand,
source.node_id,
source.sequence_start,
edge_info.source_strand,
target.node_id,
target.sequence_start,
edge_info.target_strand,
)
.into_bytes(),
)
.unwrap_or_else(|_| {
panic!(
"Error writing link from segment {} to {} to GFA stream",
"Error writing link from segment {:?} to {:?} to GFA stream",
source, target,
)
});
Expand Down
66 changes: 66 additions & 0 deletions src/graph.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,33 @@
use std::hash::Hash;
use std::iter::from_fn;

use crate::models::strand::Strand;
use petgraph::visit::{IntoNeighborsDirected, NodeCount};
use petgraph::Direction;

#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)]
pub struct GraphNode {
pub block_id: i64,
pub node_id: i64,
pub sequence_start: i64,
pub sequence_end: i64,
}

impl GraphNode {
pub fn length(&self) -> i64 {
self.sequence_end - self.sequence_start
}
}

#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)]
pub struct GraphEdge {
pub edge_id: i64,
pub chromosome_index: i64,
pub phased: i64,
pub source_strand: Strand,
pub target_strand: Strand,
}

// hacked from https://docs.rs/petgraph/latest/src/petgraph/algo/simple_paths.rs.html#36-102 to support digraphmap

pub fn all_simple_paths<G>(
Expand Down Expand Up @@ -161,4 +185,46 @@ mod tests {
HashSet::from_iter(expected_paths)
);
}

#[test]
fn test_super_bubble_path() {
// This graph looks like this:
// 8
// / \
// 6 -> 7
// / \
// 1 -> 2 -> 3 -> 4 -> 5
//
// We ensure that we capture all 3 paths from 1 -> 5
let mut graph: DiGraphMap<i64, ()> = DiGraphMap::new();
graph.add_node(1);
graph.add_node(2);
graph.add_node(3);
graph.add_node(4);
graph.add_node(5);
graph.add_node(6);
graph.add_node(7);
graph.add_node(8);
graph.add_node(9);

graph.add_edge(1, 2, ());
graph.add_edge(2, 3, ());
graph.add_edge(3, 4, ());
graph.add_edge(4, 5, ());
graph.add_edge(2, 6, ());
graph.add_edge(6, 7, ());
graph.add_edge(7, 4, ());
graph.add_edge(6, 8, ());
graph.add_edge(8, 7, ());

let paths = all_simple_paths(&graph, 1, 5).collect::<Vec<Vec<i64>>>();
assert_eq!(
HashSet::<Vec<i64>>::from_iter::<Vec<Vec<i64>>>(paths),
HashSet::from_iter(vec![
vec![1, 2, 3, 4, 5],
vec![1, 2, 6, 7, 4, 5],
vec![1, 2, 6, 8, 7, 4, 5]
])
);
}
}
16 changes: 13 additions & 3 deletions src/models/block_group.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@ use std::collections::{HashMap, HashSet};

use intervaltree::IntervalTree;
use itertools::Itertools;
use petgraph::graphmap::DiGraphMap;
use petgraph::Direction;
use rusqlite::{params_from_iter, types::Value as SQLValue, Connection};
use serde::{Deserialize, Serialize};

use crate::graph::all_simple_paths;
use crate::graph::{all_simple_paths, GraphEdge, GraphNode};
use crate::models::accession::{Accession, AccessionEdge, AccessionEdgeData, AccessionPath};
use crate::models::block_group_edge::BlockGroupEdge;
use crate::models::edge::{Edge, EdgeData, GroupBlock};
Expand Down Expand Up @@ -308,6 +309,15 @@ impl BlockGroup {
}
}

pub fn get_graph(conn: &Connection, block_group_id: i64) -> DiGraphMap<GraphNode, GraphEdge> {
let mut edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id);
let blocks = Edge::blocks_from_edges(conn, &edges);
let boundary_edges = Edge::boundary_edges_from_sequences(&blocks);
edges.extend(boundary_edges.clone());
let (graph, _) = Edge::build_graph(&edges, &blocks);
graph
}

pub fn get_all_sequences(conn: &Connection, block_group_id: i64) -> HashSet<String> {
let mut edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id);
let blocks = Edge::blocks_from_edges(conn, &edges);
Expand Down Expand Up @@ -339,15 +349,15 @@ impl BlockGroup {
for end_node in &end_nodes {
// TODO: maybe make all_simple_paths return a single path id where start == end
if start_node == *end_node {
let block = blocks_by_id.get(&start_node).unwrap();
let block = blocks_by_id.get(&start_node.block_id).unwrap();
if block.node_id != PATH_START_NODE_ID && block.node_id != PATH_END_NODE_ID {
sequences.insert(block.sequence());
}
} else {
for path in all_simple_paths(&graph, start_node, *end_node) {
let mut current_sequence = "".to_string();
for node in path {
let block = blocks_by_id.get(&node).unwrap();
let block = blocks_by_id.get(&node.block_id).unwrap();
let block_sequence = block.sequence();
current_sequence.push_str(&block_sequence);
}
Expand Down
39 changes: 35 additions & 4 deletions src/models/edge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, RandomState};

use crate::graph::{GraphEdge, GraphNode};
use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID};
use crate::models::sequence::{cached_sequence, Sequence};
use crate::models::strand::Strand;
Expand Down Expand Up @@ -412,7 +413,7 @@ impl Edge {
pub fn build_graph(
edges: &Vec<Edge>,
blocks: &Vec<GroupBlock>,
) -> (DiGraphMap<i64, ()>, HashMap<(i64, i64), Edge>) {
) -> (DiGraphMap<GraphNode, GraphEdge>, HashMap<(i64, i64), Edge>) {
let blocks_by_start = blocks
.clone()
.into_iter()
Expand All @@ -439,11 +440,21 @@ impl Edge {
)
})
.collect::<HashMap<BlockKey, i64>>();
let block_coordinates = blocks
.clone()
.into_iter()
.map(|block| (block.id, (block.start, block.end)))
.collect::<HashMap<i64, (i64, i64)>>();

let mut graph: DiGraphMap<i64, ()> = DiGraphMap::new();
let mut graph: DiGraphMap<GraphNode, GraphEdge> = DiGraphMap::new();
let mut edges_by_node_pair = HashMap::new();
for block in blocks {
graph.add_node(block.id);
graph.add_node(GraphNode {
block_id: block.id,
node_id: block.node_id,
sequence_start: block.start,
sequence_end: block.end,
});
}
for edge in edges {
let source_key = BlockKey {
Expand All @@ -459,7 +470,27 @@ impl Edge {

if let Some(source_id_value) = source_id {
if let Some(target_id_value) = target_id {
graph.add_edge(*source_id_value, *target_id_value, ());
graph.add_edge(
GraphNode {
block_id: *source_id_value,
node_id: edge.source_node_id,
sequence_start: block_coordinates[source_id_value].0,
sequence_end: block_coordinates[source_id_value].1,
},
GraphNode {
block_id: *target_id_value,
node_id: edge.target_node_id,
sequence_start: block_coordinates[target_id_value].0,
sequence_end: block_coordinates[target_id_value].1,
},
GraphEdge {
chromosome_index: edge.chromosome_index,
phased: edge.phased,
edge_id: edge.id,
source_strand: edge.source_strand,
target_strand: edge.target_strand,
},
);
edges_by_node_pair.insert((*source_id_value, *target_id_value), edge.clone());
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/models/strand.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use rusqlite::ToSql;
use serde::{Deserialize, Serialize};
use std::fmt;

#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize, Ord, PartialOrd)]
pub enum Strand {
Forward,
Reverse,
Expand Down
27 changes: 25 additions & 2 deletions src/test_helpers.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
use std::fs;

use petgraph::graphmap::DiGraphMap;
use rusqlite::Connection;
use std::fs;
use std::io::Write;
use tempdir::TempDir;

use crate::config::{get_or_create_gen_dir, BASE_DIR};
use crate::graph::{GraphEdge, GraphNode};
use crate::migrations::{run_migrations, run_operation_migrations};
use crate::models::block_group::BlockGroup;
use crate::models::block_group_edge::BlockGroupEdge;
Expand Down Expand Up @@ -149,3 +151,24 @@ pub fn setup_block_group(conn: &Connection) -> (i64, Path) {
);
(block_group.id, path)
}

pub fn save_graph(graph: &DiGraphMap<GraphNode, GraphEdge>, path: &str) {
use petgraph::dot::{Config, Dot};
use std::fs::File;
let mut file = File::create(path).unwrap();
file.write_all(
format!(
"{dot:?}",
dot = Dot::with_attr_getters(
&graph,
&[Config::NodeNoLabel, Config::EdgeNoLabel],
&|_, (_, _, edge_weight)| format!("label = \"{}\"", edge_weight.chromosome_index),
&|_, (node, weight)| format!(
"label = \"{}[{}-{}]\"",
node.node_id, node.sequence_start, node.sequence_end
),
)
)
.as_bytes(),
);
}

0 comments on commit 0b41e0e

Please sign in to comment.