Skip to content

Commit

Permalink
Merge pull request #69 from ginkgobioworks/more-gfa-export-improvements
Browse files Browse the repository at this point in the history
Fix boundary edge bug
  • Loading branch information
dkhofer authored Oct 18, 2024
2 parents e8a030d + efbc9b6 commit dd8450c
Show file tree
Hide file tree
Showing 2 changed files with 282 additions and 6 deletions.
120 changes: 118 additions & 2 deletions src/exports/gfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ pub fn export_gfa(

let mut edges = edge_set.into_iter().collect();
let (blocks, boundary_edges) = Edge::blocks_from_edges(conn, &edges);

edges.extend(boundary_edges.clone());

let (graph, edges_by_node_pair) = Edge::build_graph(&edges, &blocks);
Expand Down Expand Up @@ -255,13 +256,14 @@ mod tests {

use crate::imports::gfa::import_gfa;
use crate::models::{
block_group::BlockGroup,
block_group::{BlockGroup, PathChange},
collection::Collection,
node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID},
path::PathBlock,
sequence::Sequence,
};

use crate::test_helpers::{get_connection, setup_gen_dir};
use crate::test_helpers::{get_connection, setup_block_group, setup_gen_dir};
use tempfile::tempdir;

#[test]
Expand Down Expand Up @@ -464,4 +466,118 @@ mod tests {

assert_eq!(all_sequences, all_sequences2);
}

#[test]
fn test_sequence_is_split_into_multiple_segments() {
// Confirm that if edges are added to or from a sequence, that results in the sequence being
// split into multiple segments in the exported GFA, and that the multiple segments are
// re-imported as multiple sequences
let conn = get_connection(None);
let (block_group_id, path) = setup_block_group(&conn);
let insert_sequence = Sequence::new()
.sequence_type("DNA")
.sequence("NNNN")
.save(&conn);
let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str(), None);
let insert = PathBlock {
id: 0,
node_id: insert_node_id,
block_sequence: insert_sequence.get_sequence(0, 4).to_string(),
sequence_start: 0,
sequence_end: 4,
path_start: 7,
path_end: 15,
strand: Strand::Forward,
};
let change = PathChange {
block_group_id,
path: path.clone(),
start: 7,
end: 15,
block: insert,
chromosome_index: 1,
phased: 0,
};
let tree = Path::intervaltree_for(&conn, &path);
BlockGroup::insert_change(&conn, &change, &tree);

let edges = BlockGroupEdge::edges_for_block_group(&conn, block_group_id);
let mut node_ids = HashSet::new();
let mut edge_ids = HashSet::new();
for edge in edges {
if !Node::is_terminal(edge.source_node_id) {
node_ids.insert(edge.source_node_id);
}
if !Node::is_terminal(edge.target_node_id) {
node_ids.insert(edge.target_node_id);
}
if !Node::is_terminal(edge.source_node_id) && !Node::is_terminal(edge.target_node_id) {
edge_ids.insert(edge.id);
}
}

// The original 10-length A, T, C, G sequences, plus NNNN
assert_eq!(node_ids.len(), 5);
// 3 edges from A sequence -> T sequence, T sequence -> C sequence, C sequence -> G sequence
// 2 edges to and from NNNN
// 5 total
assert_eq!(edge_ids.len(), 5);

let nodes = Node::get_nodes(&conn, node_ids.into_iter().collect::<Vec<i64>>());
let mut node_hashes = HashSet::new();
for node in nodes {
if !Node::is_terminal(node.id) {
node_hashes.insert(node.sequence_hash);
}
}

// The original 10-length A, T, C, G sequences, plus NNNN
assert_eq!(node_hashes.len(), 5);

let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
let mut gfa_path = PathBuf::from(temp_dir.path());
gfa_path.push("intermediate.gfa");
export_gfa(&conn, "test", &gfa_path, None);
import_gfa(&gfa_path, "test collection 2", &conn);

let block_group2 = Collection::get_block_groups(&conn, "test collection 2")
.pop()
.unwrap();

let edges2 = BlockGroupEdge::edges_for_block_group(&conn, block_group2.id);
let mut node_ids2 = HashSet::new();
let mut edge_ids2 = HashSet::new();
for edge in edges2 {
if !Node::is_terminal(edge.source_node_id) {
node_ids2.insert(edge.source_node_id);
}
if !Node::is_terminal(edge.target_node_id) {
node_ids2.insert(edge.target_node_id);
}
if !Node::is_terminal(edge.source_node_id) && !Node::is_terminal(edge.target_node_id) {
edge_ids2.insert(edge.id);
}
}

// The 10-length A and T sequences have now been split in two (showing up as different
// segments in the exported GFA), so expect two more nodes
assert_eq!(node_ids2.len(), 7);
// 3 edges from A sequence -> T sequence, T sequence -> C sequence, C sequence -> G sequence
// 2 boundary edges (exported as real links) in A sequence and T sequence
// 2 edges to and from NNNN
// 7 total
assert_eq!(edge_ids2.len(), 7);

let nodes2 = Node::get_nodes(&conn, node_ids2.into_iter().collect::<Vec<i64>>());
let mut node_hashes2 = HashSet::new();
for node in nodes2 {
if !Node::is_terminal(node.id) {
node_hashes2.insert(node.sequence_hash);
}
}

// The 10-length A and T sequences have now been split in two, but since the T sequences was
// split in half, there's just one new TTTTT sequence shared by 2 nodes
assert_eq!(node_hashes2.len(), 6);
}
}
168 changes: 164 additions & 4 deletions src/models/edge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -330,14 +330,14 @@ impl Edge {
edges_by_source_node_id
.entry(edge.source_node_id)
.and_modify(|edges| edges.push(edge))
.or_default();
.or_insert(vec![edge]);
}
if edge.target_node_id != PATH_END_NODE_ID {
node_ids.insert(edge.target_node_id);
edges_by_target_node_id
.entry(edge.target_node_id)
.and_modify(|edges| edges.push(edge))
.or_default();
.or_insert(vec![edge]);
}
}

Expand Down Expand Up @@ -489,8 +489,14 @@ impl Edge {
mod tests {
// Note this useful idiom: importing names from outer (for mod tests) scope.
use super::*;
use crate::models::{collection::Collection, sequence::Sequence};
use crate::test_helpers::get_connection;
use crate::models::{
block_group::{BlockGroup, PathChange},
block_group_edge::BlockGroupEdge,
collection::Collection,
path::{Path, PathBlock},
sequence::Sequence,
};
use crate::test_helpers::{get_connection, setup_block_group};

#[test]
fn test_bulk_create() {
Expand Down Expand Up @@ -724,4 +730,158 @@ mod tests {
assert_eq!(edge_result3.target_node_id, PATH_END_NODE_ID);
assert_eq!(edge_result3.target_coordinate, -1);
}

#[test]
fn test_blocks_from_edges() {
let conn = get_connection(None);
let (block_group_id, path) = setup_block_group(&conn);

let edges = BlockGroupEdge::edges_for_block_group(&conn, block_group_id);
let (blocks, boundary_edges) = Edge::blocks_from_edges(&conn, &edges);
// 4 actual sequences: 10-length ones of all A, all T, all C, all G
// 2 terminal node blocks (start/end)
// 6 total
assert_eq!(blocks.len(), 6);
assert_eq!(boundary_edges.len(), 0);

let insert_sequence = Sequence::new()
.sequence_type("DNA")
.sequence("NNNN")
.save(&conn);
let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str(), None);
let insert = PathBlock {
id: 0,
node_id: insert_node_id,
block_sequence: insert_sequence.get_sequence(0, 4).to_string(),
sequence_start: 0,
sequence_end: 4,
path_start: 7,
path_end: 15,
strand: Strand::Forward,
};
let change = PathChange {
block_group_id,
path: path.clone(),
start: 7,
end: 15,
block: insert,
chromosome_index: 1,
phased: 0,
};
let tree = Path::intervaltree_for(&conn, &path);
BlockGroup::insert_change(&conn, &change, &tree);
let mut edges = BlockGroupEdge::edges_for_block_group(&conn, block_group_id);

let (blocks, boundary_edges) = Edge::blocks_from_edges(&conn, &edges);
// 2 10-length sequences of all C, all G
// 1 inserted NNNN sequence
// 4 split blocks (A and T sequences were split) resulting from the inserted sequence
// 2 terminal node blocks (start/end)
// 9 total
assert_eq!(blocks.len(), 9);
assert_eq!(boundary_edges.len(), 2);

// Confirm that ordering doesn't matter
edges.reverse();
let (blocks, boundary_edges) = Edge::blocks_from_edges(&conn, &edges);
// 2 10-length sequences of all C, all G
// 1 inserted NNNN sequence
// 4 split blocks (A and T sequences were split) resulting from the inserted sequence
// 2 terminal node blocks (start/end)
// 9 total
assert_eq!(blocks.len(), 9);
assert_eq!(boundary_edges.len(), 2);
}

#[test]
fn test_get_block_boundaries() {
let conn = get_connection(None);
let template_sequence = Sequence::new()
.sequence_type("DNA")
.sequence("AAAAAAAAAA")
.save(&conn);
let template_node_id = Node::create(&conn, template_sequence.hash.as_str(), None);

let insert_sequence = Sequence::new()
.sequence_type("DNA")
.sequence("NNNN")
.save(&conn);
let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str(), None);

let edge1 = Edge::create(
&conn,
template_node_id,
2,
Strand::Forward,
insert_node_id,
0,
Strand::Forward,
0,
0,
);
let edge2 = Edge::create(
&conn,
insert_node_id,
4,
Strand::Forward,
template_node_id,
3,
Strand::Forward,
0,
0,
);

let boundaries = Edge::get_block_boundaries(Some(&vec![&edge1]), Some(&vec![&edge2]), 10);
assert_eq!(boundaries, vec![2, 3]);
}

#[test]
fn test_get_block_boundaries_with_two_original_sequences() {
let conn = get_connection(None);
let template_sequence1 = Sequence::new()
.sequence_type("DNA")
.sequence("AAAAAAAAAA")
.save(&conn);
let template1_node_id = Node::create(&conn, template_sequence1.hash.as_str(), None);

let template_sequence2 = Sequence::new()
.sequence_type("DNA")
.sequence("TTTTTTTTTT")
.save(&conn);
let template2_node_id = Node::create(&conn, template_sequence2.hash.as_str(), None);

let insert_sequence = Sequence::new()
.sequence_type("DNA")
.sequence("NNNN")
.save(&conn);
let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str(), None);

let edge1 = Edge::create(
&conn,
template1_node_id,
2,
Strand::Forward,
insert_node_id,
0,
Strand::Forward,
0,
0,
);
let edge2 = Edge::create(
&conn,
insert_node_id,
4,
Strand::Forward,
template2_node_id,
3,
Strand::Forward,
0,
0,
);

let outgoing_boundaries = Edge::get_block_boundaries(Some(&vec![&edge1]), None, 10);
assert_eq!(outgoing_boundaries, vec![2]);
let incoming_boundaries = Edge::get_block_boundaries(None, Some(&vec![&edge2]), 10);
assert_eq!(incoming_boundaries, vec![3]);
}
}

0 comments on commit dd8450c

Please sign in to comment.