From 6116f134817242fa0956b1a9d684035bded4d79d Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 16 Oct 2024 14:47:48 -0400 Subject: [PATCH 1/6] Map ranges from one path to another --- src/lib.rs | 1 + src/models/path.rs | 1005 ++++++++++++++++++++++++++++++++++++++++++++ src/range.rs | 270 ++++++++++++ 3 files changed, 1276 insertions(+) create mode 100644 src/range.rs diff --git a/src/lib.rs b/src/lib.rs index 34332bf..42f8252 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,6 +8,7 @@ pub mod imports; pub mod migrations; pub mod models; pub mod operation_management; +pub mod range; pub mod test_helpers; pub mod updates; diff --git a/src/models/path.rs b/src/models/path.rs index d1d5ade..d5eb1c9 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -13,6 +13,7 @@ use crate::models::{ sequence::Sequence, strand::Strand, }; +use crate::range::{Range, RangeMapping}; #[derive(Clone, Debug, Eq, Hash, PartialEq, Deserialize, Serialize)] pub struct Path { @@ -295,6 +296,133 @@ impl Path { .collect(); tree } + + pub fn find_block_mappings( + conn: &Connection, + our_path: &Path, + other_path: &Path, + ) -> Vec { + // Given two paths, find the overlapping parts of common nodes/blocks and return a list af + // mappings from subranges of one path to corresponding shared subranges of the other path + let our_blocks = Path::blocks_for(conn, our_path); + let their_blocks = Path::blocks_for(conn, other_path); + + let our_node_ids = our_blocks + .iter() + .map(|block| block.node_id) + .collect::>(); + let their_node_ids = their_blocks + .iter() + .map(|block| block.node_id) + .collect::>(); + let common_node_ids = our_node_ids + .intersection(&their_node_ids) + .copied() + .collect::>(); + + let mut our_blocks_by_node_id = HashMap::new(); + for block in our_blocks + .iter() + .filter(|block| common_node_ids.contains(&block.node_id)) + { + our_blocks_by_node_id + .entry(block.node_id) + .or_insert(vec![]) + .push(block); + } + + let mut their_blocks_by_node_id = HashMap::new(); + for block in their_blocks + .iter() + .filter(|block| common_node_ids.contains(&block.node_id)) + { + their_blocks_by_node_id + .entry(block.node_id) + .or_insert(vec![]) + .push(block); + } + + let mut mappings = vec![]; + for node_id in common_node_ids { + let our_blocks = our_blocks_by_node_id.get(&node_id).unwrap(); + let our_sorted_blocks = our_blocks + .clone() + .into_iter() + .sorted_by(|a, b| a.sequence_start.cmp(&b.sequence_start)) + .collect::>(); + let their_blocks = their_blocks_by_node_id.get(&node_id).unwrap(); + let their_sorted_blocks = their_blocks + .clone() + .into_iter() + .sorted_by(|a, b| a.sequence_start.cmp(&b.sequence_start)) + .collect::>(); + + for our_block in our_sorted_blocks { + let mut their_block_index = 0; + + while their_block_index < their_sorted_blocks.len() { + let their_block = their_sorted_blocks[their_block_index]; + if their_block.sequence_end <= our_block.sequence_start { + // If their block is before ours, move along to the next one + their_block_index += 1; + } else { + let our_range = Range { + start: our_block.sequence_start, + end: our_block.sequence_end, + }; + let their_range = Range { + start: their_block.sequence_start, + end: their_block.sequence_end, + }; + + let common_ranges = our_range.overlap(&their_range); + if !common_ranges.is_empty() { + if common_ranges.len() > 1 { + panic!( + "Found more than one common range for blocks with node {}", + node_id + ); + } + + let common_range = &common_ranges[0]; + let our_start = our_block.path_start + + (common_range.start - our_block.sequence_start); + let our_end = our_block.path_start + + (common_range.end - our_block.sequence_start); + let their_start = their_block.path_start + + (common_range.start - their_block.sequence_start); + let their_end = their_block.path_start + + (common_range.end - their_block.sequence_start); + + let mapping = RangeMapping { + source_range: Range { + start: our_start, + end: our_end, + }, + target_range: Range { + start: their_start, + end: their_end, + }, + }; + mappings.push(mapping); + } + + if their_block.sequence_end < our_block.sequence_end { + // If their block ends before ours, move along to the next one + their_block_index += 1; + } else { + break; + } + } + } + } + } + + mappings + .into_iter() + .sorted_by(|a, b| a.source_range.start.cmp(&b.source_range.start)) + .collect::>() + } } #[cfg(test)] @@ -610,4 +738,881 @@ mod tests { assert_eq!(block4.path_end, 29); assert_eq!(block4.strand, Strand::Forward); } + + #[test] + fn test_full_block_mapping() { + /* + |--------| path: 1 sequence, (0, 8) + |ATCGATCG| + |--------| Same path: 1 sequence, (0, 8) + + Mapping: (0, 8) -> (0, 8) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let mappings = Path::find_block_mappings(conn, &path, &path); + assert_eq!(mappings.len(), 1); + let mapping = &mappings[0]; + assert_eq!(mapping.source_range, mapping.target_range); + assert_eq!(mapping.source_range.start, 0); + assert_eq!(mapping.source_range.end, 8); + assert_eq!(mapping.target_range.start, 0); + assert_eq!(mapping.target_range.end, 8); + } + + #[test] + fn test_no_block_mapping_overlap() { + /* + |--------| -> path 1 (one node) + |ATCGATCG| -> sequence + + |--------| -> path 2 (one node, totally different sequence) + |TTTTTTTT| -> other sequence + + Mappings: empty + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge3 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge4 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create(conn, "chr2", block_group.id, &[edge3.id, edge4.id]); + + let mappings = Path::find_block_mappings(conn, &path1, &path2); + assert_eq!(mappings.len(), 0); + } + + #[test] + fn test_partial_overlap_block_mapping() { + /* + path 1 (one node/sequence): + |--------| + |ATCGATCG| -> sequence (0, 8) + + path 2: + |----| -> (0, 4) + |--------| -> (4, 12) + |ATCG| -> shared with path 1 + |TTTTTTTT| -> unrelated sequence + + Mapping: (0, 4) -> (0, 4) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge3 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge4 = Edge::create( + conn, + node1_id, + 4, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge5 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge3.id, edge4.id, edge5.id], + ); + + assert_eq!(Path::sequence(conn, path2.clone()), "ATCGTTTTTTTT"); + + let mappings = Path::find_block_mappings(conn, &path1, &path2); + assert_eq!(mappings.len(), 1); + let mapping = &mappings[0]; + assert_eq!(mapping.source_range, mapping.target_range); + assert_eq!(mapping.source_range.start, 0); + assert_eq!(mapping.source_range.end, 4); + assert_eq!(mapping.target_range.start, 0); + assert_eq!(mapping.target_range.end, 4); + } + + #[test] + fn test_insertion_block_mapping() { + /* + path 1 (one node/sequence): + |ATCGATCG| -> sequence (0, 8) + + path 2: Mimics a pure insertion + |ATCG| -> (0, 4) shared with first half of path 1 + |TTTTTTTT| -> (4, 12) unrelated sequence + |ATCG| -> (12, 16) shared with second half of path 1 + + Mappings: + (0, 4) -> (0, 4) + (4, 8) -> (12, 16) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge4 = Edge::create( + conn, + node1_id, + 4, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge5 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + node1_id, + 4, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge1.id, edge4.id, edge5.id, edge2.id], + ); + + assert_eq!(Path::sequence(conn, path2.clone()), "ATCGTTTTTTTTATCG"); + + let mappings = Path::find_block_mappings(conn, &path1, &path2); + assert_eq!(mappings.len(), 2); + let mapping1 = &mappings[0]; + assert_eq!(mapping1.source_range, mapping1.target_range); + assert_eq!(mapping1.source_range.start, 0); + assert_eq!(mapping1.source_range.end, 4); + assert_eq!(mapping1.target_range.start, 0); + assert_eq!(mapping1.target_range.end, 4); + + let mapping2 = &mappings[1]; + assert_eq!(mapping2.source_range.start, 4); + assert_eq!(mapping2.source_range.end, 8); + assert_eq!(mapping2.target_range.start, 12); + assert_eq!(mapping2.target_range.end, 16); + } + + #[test] + fn test_replacement_block_mapping() { + /* + path 1 (one node/sequence): + |ATCGATCG| -> sequence (0, 8) + + path 2: Mimics a replacement + |AT| -> (0, 2) shared with first two bp of path 1 + |TTTTTTTT| -> (2, 10) unrelated sequence + |CG| -> (10, 12) shared with last 2 bp of path 1 + + Mappings: + (0, 2) -> (0, 2) + (6, 8) -> (10, 12) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge4 = Edge::create( + conn, + node1_id, + 2, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge5 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + node1_id, + 6, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge1.id, edge4.id, edge5.id, edge2.id], + ); + + assert_eq!(Path::sequence(conn, path2.clone()), "ATTTTTTTTTCG"); + + let mappings = Path::find_block_mappings(conn, &path1, &path2); + assert_eq!(mappings.len(), 2); + let mapping1 = &mappings[0]; + assert_eq!(mapping1.source_range, mapping1.target_range); + assert_eq!(mapping1.source_range.start, 0); + assert_eq!(mapping1.source_range.end, 2); + assert_eq!(mapping1.target_range.start, 0); + assert_eq!(mapping1.target_range.end, 2); + + let mapping2 = &mappings[1]; + assert_eq!(mapping2.source_range.start, 6); + assert_eq!(mapping2.source_range.end, 8); + assert_eq!(mapping2.target_range.start, 10); + assert_eq!(mapping2.target_range.end, 12); + } + + #[test] + fn test_deletion_block_mapping() { + /* + path 1 (one node/sequence): + |ATCGATCG| -> sequence (0, 8) + + path 2: Mimics a pure deletion + |AT| -> (0, 2) shared with first two bp of path 1 + |CG| -> (2, 4) shared with last 2 bp of path 1 + + Mappings: + (0, 2) -> (0, 2) + (6, 8) -> (2, 4) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let edge4 = Edge::create( + conn, + node1_id, + 2, + Strand::Forward, + node1_id, + 6, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge1.id, edge4.id, edge2.id], + ); + + assert_eq!(Path::sequence(conn, path2.clone()), "ATCG"); + + let mappings = Path::find_block_mappings(conn, &path1, &path2); + assert_eq!(mappings.len(), 2); + let mapping1 = &mappings[0]; + assert_eq!(mapping1.source_range, mapping1.target_range); + assert_eq!(mapping1.source_range.start, 0); + assert_eq!(mapping1.source_range.end, 2); + assert_eq!(mapping1.target_range.start, 0); + assert_eq!(mapping1.target_range.end, 2); + + let mapping2 = &mappings[1]; + assert_eq!(mapping2.source_range.start, 6); + assert_eq!(mapping2.source_range.end, 8); + assert_eq!(mapping2.target_range.start, 2); + assert_eq!(mapping2.target_range.end, 4); + } + + #[test] + fn test_two_block_insertion_mapping() { + /* + path 1 (two nodes/sequences): + |ATCGATCG| -> sequence (0, 8) + |TTTTTTTT| -> sequence (8, 16) + + path 2: Mimics a pure insertion in the middle of the two blocks + |ATCGATCG| -> sequence (0, 8) + |AAAAAAAA| -> sequence (8, 16) + |TTTTTTTT| -> sequence (16, 24) + + Mappings: + (0, 8) -> (0, 8) + (8, 16) -> (16, 24) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge3 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create( + conn, + "chr1", + block_group.id, + &[edge1.id, edge2.id, edge3.id], + ); + + let sequence3 = Sequence::new() + .sequence_type("DNA") + .sequence("AAAAAAAA") + .save(conn); + let node3_id = Node::create(conn, sequence3.hash.as_str(), None); + let edge4 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + node3_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge5 = Edge::create( + conn, + node3_id, + 8, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge1.id, edge4.id, edge5.id, edge3.id], + ); + + assert_eq!( + Path::sequence(conn, path2.clone()), + "ATCGATCGAAAAAAAATTTTTTTT" + ); + + let mappings = Path::find_block_mappings(conn, &path1, &path2); + assert_eq!(mappings.len(), 2); + let mapping1 = &mappings[0]; + assert_eq!(mapping1.source_range, mapping1.target_range); + assert_eq!(mapping1.source_range.start, 0); + assert_eq!(mapping1.source_range.end, 8); + assert_eq!(mapping1.target_range.start, 0); + assert_eq!(mapping1.target_range.end, 8); + + let mapping2 = &mappings[1]; + assert_eq!(mapping2.source_range.start, 8); + assert_eq!(mapping2.source_range.end, 16); + assert_eq!(mapping2.target_range.start, 16); + assert_eq!(mapping2.target_range.end, 24); + } + + #[test] + fn test_two_block_replacement_mapping() { + /* + path 1 (two nodes/sequences): + |ATCGATCG| -> sequence (0, 8) + |TTTTTTTT| -> sequence (8, 16) + + path 2: Mimics a replacement across the two blocks + |ATCG| -> sequence (0, 4) + |AAAAAAAA| -> sequence (4, 12) + |TTTT| -> sequence (12, 16) + + Mappings: + (0, 4) -> (0, 4) + (12, 16) -> (12, 16) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge3 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create( + conn, + "chr1", + block_group.id, + &[edge1.id, edge2.id, edge3.id], + ); + + let sequence3 = Sequence::new() + .sequence_type("DNA") + .sequence("AAAAAAAA") + .save(conn); + let node3_id = Node::create(conn, sequence3.hash.as_str(), None); + let edge4 = Edge::create( + conn, + node1_id, + 4, + Strand::Forward, + node3_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge5 = Edge::create( + conn, + node3_id, + 8, + Strand::Forward, + node2_id, + 4, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge1.id, edge4.id, edge5.id, edge3.id], + ); + + assert_eq!(Path::sequence(conn, path2.clone()), "ATCGAAAAAAAATTTT"); + + let mappings = Path::find_block_mappings(conn, &path1, &path2); + assert_eq!(mappings.len(), 2); + let mapping1 = &mappings[0]; + assert_eq!(mapping1.source_range, mapping1.target_range); + assert_eq!(mapping1.source_range.start, 0); + assert_eq!(mapping1.source_range.end, 4); + assert_eq!(mapping1.target_range.start, 0); + assert_eq!(mapping1.target_range.end, 4); + + let mapping2 = &mappings[1]; + assert_eq!(mapping2.source_range.start, 12); + assert_eq!(mapping2.source_range.end, 16); + assert_eq!(mapping2.target_range.start, 12); + assert_eq!(mapping2.target_range.end, 16); + } + + #[test] + fn test_two_block_deletion_mapping() { + /* + path 1 (two nodes/sequences): + |ATCGATCG| -> sequence (0, 8) + |TTTTTTTT| -> sequence (8, 16) + + path 2: Mimics a deletion across the two blocks + |ATCG| -> sequence (0, 4) + |TTTT| -> sequence (4, 8) + + Mappings: + (0, 4) -> (0, 4) + (12, 16) -> (4, 8) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge3 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create( + conn, + "chr1", + block_group.id, + &[edge1.id, edge2.id, edge3.id], + ); + + let edge4 = Edge::create( + conn, + node1_id, + 4, + Strand::Forward, + node2_id, + 4, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge1.id, edge4.id, edge3.id], + ); + + assert_eq!(Path::sequence(conn, path2.clone()), "ATCGTTTT"); + + let mappings = Path::find_block_mappings(conn, &path1, &path2); + assert_eq!(mappings.len(), 2); + let mapping1 = &mappings[0]; + assert_eq!(mapping1.source_range, mapping1.target_range); + assert_eq!(mapping1.source_range.start, 0); + assert_eq!(mapping1.source_range.end, 4); + assert_eq!(mapping1.target_range.start, 0); + assert_eq!(mapping1.target_range.end, 4); + + let mapping2 = &mappings[1]; + assert_eq!(mapping2.source_range.start, 12); + assert_eq!(mapping2.source_range.end, 16); + assert_eq!(mapping2.target_range.start, 4); + assert_eq!(mapping2.target_range.end, 8); + } } diff --git a/src/range.rs b/src/range.rs new file mode 100644 index 0000000..72c08d4 --- /dev/null +++ b/src/range.rs @@ -0,0 +1,270 @@ +use itertools::Itertools; +use std::cmp::{max, min}; + +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub struct Range { + pub start: i64, + pub end: i64, +} + +impl Range { + pub fn extend_to(&self, other: &Range) -> Range { + Range { + start: self.start, + end: other.end, + } + } + + pub fn left_adjoins(&self, other: &Range, modulus: Option) -> bool { + let mut other_start = other.start; + let mut self_end = self.end; + if let Some(modulus) = modulus { + other_start %= modulus; + self_end %= modulus; + } + + self_end == other_start + } + + pub fn is_wraparound(&self) -> bool { + self.start > self.end + } + + pub fn overlap(&self, other: &Range) -> Vec { + /* + Returns the overlapping ranges between two ranges. If there are multiple overlapping ranges, + such as can be the case when a range wraps the origin, multiple ranges are returned. If + there are no overlapping ranges, an empty list is returned. + + Examples: + + Overlap between two non-wraparound ranges + 6 19 + |------------| self + AAAAAAAAAAAAAAAAAAAAAAAA + |------------| other + 0 13 + |------| overlap + 6 13 + + Overlap with a wraparound range + 2 6 + >-| |----------------> self + AAAAAAAAAAAAAAAAAAAAAAAA + |---| other + 0 4 + |-| overlap + 0 2 + + Overlap with multiple wraparound ranges + 2 6 + >-| |----------------> self + AAAAAAAAAAAAAAAAAAAAAAAA + >---| |---------> other + 4 13 + >-| |---------> overlap + 2 13 + + Multiple Overlaps + 4 13 + >---| |---------> self + AAAAAAAAAAAAAAAAAAAAAAAA + |----------------| other + 2 19 + |-| |-----| overlaps + 2 4 13 19 + */ + + let start1 = self.start; + let end1 = self.end; + let start2 = other.start; + let end2 = other.end; + + let mut self_intervals = vec![]; + let mut other_intervals = vec![]; + + // split the ranges into pre-/post-origin segments + if self.is_wraparound() { + self_intervals.extend(vec![ + Range { + start: start1, + end: i64::MAX, + }, + Range { + start: 1, + end: end1, + }, + ]); + } else { + self_intervals.push(self.clone()); + } + + if other.is_wraparound() { + other_intervals.extend(vec![ + Range { + start: start2, + end: i64::MAX, + }, + Range { + start: 1, + end: end2, + }, + ]); + } else { + other_intervals.push(other.clone()); + } + + let overlaps = Range::find_pairwise_overlaps(self_intervals, other_intervals); + + if overlaps.len() > 1 { + Range::consolidate_overlaps_about_the_origin(overlaps) + } else { + overlaps + } + } + + fn find_pairwise_overlaps(intervals1: Vec, intervals2: Vec) -> Vec { + let mut overlaps = vec![]; + for interval1 in intervals1 { + for interval2 in &intervals2 { + if interval1.end > interval2.start && interval1.start <= interval2.end { + overlaps.push(Range { + start: max(interval1.start, interval2.start), + end: min(interval1.end, interval2.end), + }); + } + } + } + + overlaps + } + + fn consolidate_overlaps_about_the_origin(overlaps: Vec) -> Vec { + let mut sorted_overlaps = overlaps + .clone() + .into_iter() + .sorted_by(|a, b| a.start.cmp(&b.start)) + .collect::>(); + let first = sorted_overlaps.first().unwrap().clone(); + let last = sorted_overlaps.last().unwrap().clone(); + if first.start == 0 && last.end == i64::MAX { + sorted_overlaps.pop(); + sorted_overlaps.push(Range { + start: last.start, + end: first.end, + }); + } + + sorted_overlaps + } +} + +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub struct RangeMapping { + pub source_range: Range, + pub target_range: Range, +} + +impl RangeMapping { + pub fn merge_continuous_mappings(mappings: Vec) -> Vec { + let mut grouped_mappings = vec![]; + let mut current_group = vec![]; + + for mapping in mappings { + if current_group.is_empty() { + current_group.push(mapping); + } else { + let last_mapping = current_group.last().unwrap(); + if last_mapping + .source_range + .left_adjoins(&mapping.source_range, None) + && last_mapping + .target_range + .left_adjoins(&mapping.target_range, None) + { + current_group.push(mapping); + } else { + grouped_mappings.push(current_group); + current_group = vec![mapping]; + } + } + } + + if !current_group.is_empty() { + grouped_mappings.push(current_group); + } + + let mut merged_mappings = vec![]; + for group in grouped_mappings { + let first = group.first().unwrap(); + let last = group.last().unwrap(); + merged_mappings.push(RangeMapping { + source_range: first.source_range.extend_to(&last.source_range), + target_range: first.target_range.extend_to(&last.target_range), + }); + } + + merged_mappings + } +} + +#[cfg(test)] +mod tests { + // Note this useful idiom: importing names from outer (for mod tests) scope. + use super::*; + + #[test] + fn test_left_adjoins() { + let left_range = Range { start: 0, end: 2 }; + let middle_range = Range { start: 1, end: 3 }; + let right_range = Range { start: 2, end: 4 }; + + assert!(left_range.left_adjoins(&right_range, None)); + assert!(!left_range.left_adjoins(&middle_range, None)); + assert!(!middle_range.left_adjoins(&right_range, None)); + assert!(!right_range.left_adjoins(&left_range, None)); + assert!(!right_range.left_adjoins(&middle_range, None)); + assert!(!middle_range.left_adjoins(&left_range, None)); + + assert!(right_range.left_adjoins(&left_range, Some(4))); + assert!(left_range.left_adjoins(&right_range, Some(4))); + assert!(!left_range.left_adjoins(&middle_range, Some(4))); + assert!(!middle_range.left_adjoins(&right_range, Some(4))); + assert!(!right_range.left_adjoins(&middle_range, Some(4))); + assert!(!middle_range.left_adjoins(&left_range, Some(4))); + } + + #[test] + fn test_merge_continuous_ranges() { + let mappings = vec![ + RangeMapping { + source_range: Range { start: 0, end: 2 }, + target_range: Range { start: 2, end: 4 }, + }, + RangeMapping { + source_range: Range { start: 2, end: 5 }, + target_range: Range { start: 4, end: 7 }, + }, + RangeMapping { + source_range: Range { start: 7, end: 8 }, + target_range: Range { start: 9, end: 10 }, + }, + ]; + + let merged_mappings = RangeMapping::merge_continuous_mappings(mappings); + assert_eq!(merged_mappings.len(), 2); + assert_eq!( + merged_mappings, + vec![ + RangeMapping { + source_range: Range { start: 0, end: 5 }, + target_range: Range { start: 2, end: 7 }, + }, + RangeMapping { + source_range: Range { start: 7, end: 8 }, + target_range: Range { start: 9, end: 10 }, + }, + ] + ); + } +} From f4854d2c15bc6206c89f4bf5ed8aea7e3ee2d08d Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 16 Oct 2024 14:54:13 -0400 Subject: [PATCH 2/6] Fix comment indentation --- src/models/path.rs | 74 +++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/src/models/path.rs b/src/models/path.rs index d5eb1c9..10105bc 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -742,7 +742,7 @@ mod tests { #[test] fn test_full_block_mapping() { /* - |--------| path: 1 sequence, (0, 8) + |--------| path: 1 sequence, (0, 8) |ATCGATCG| |--------| Same path: 1 sequence, (0, 8) @@ -794,7 +794,7 @@ mod tests { #[test] fn test_no_block_mapping_overlap() { /* - |--------| -> path 1 (one node) + |--------| -> path 1 (one node) |ATCGATCG| -> sequence |--------| -> path 2 (one node, totally different sequence) @@ -872,14 +872,14 @@ mod tests { #[test] fn test_partial_overlap_block_mapping() { /* - path 1 (one node/sequence): - |--------| + path 1 (one node/sequence): + |--------| |ATCGATCG| -> sequence (0, 8) - path 2: - |----| -> (0, 4) + path 2: + |----| -> (0, 4) |--------| -> (4, 12) - |ATCG| -> shared with path 1 + |ATCG| -> shared with path 1 |TTTTTTTT| -> unrelated sequence Mapping: (0, 4) -> (0, 4) @@ -978,17 +978,17 @@ mod tests { #[test] fn test_insertion_block_mapping() { /* - path 1 (one node/sequence): + path 1 (one node/sequence): |ATCGATCG| -> sequence (0, 8) - path 2: Mimics a pure insertion - |ATCG| -> (0, 4) shared with first half of path 1 + path 2: Mimics a pure insertion + |ATCG| -> (0, 4) shared with first half of path 1 |TTTTTTTT| -> (4, 12) unrelated sequence - |ATCG| -> (12, 16) shared with second half of path 1 + |ATCG| -> (12, 16) shared with second half of path 1 Mappings: - (0, 4) -> (0, 4) - (4, 8) -> (12, 16) + (0, 4) -> (0, 4) + (4, 8) -> (12, 16) */ let conn = &mut get_connection(None); Collection::create(conn, "test collection"); @@ -1079,17 +1079,17 @@ mod tests { #[test] fn test_replacement_block_mapping() { /* - path 1 (one node/sequence): + path 1 (one node/sequence): |ATCGATCG| -> sequence (0, 8) - path 2: Mimics a replacement - |AT| -> (0, 2) shared with first two bp of path 1 + path 2: Mimics a replacement + |AT| -> (0, 2) shared with first two bp of path 1 |TTTTTTTT| -> (2, 10) unrelated sequence - |CG| -> (10, 12) shared with last 2 bp of path 1 + |CG| -> (10, 12) shared with last 2 bp of path 1 Mappings: - (0, 2) -> (0, 2) - (6, 8) -> (10, 12) + (0, 2) -> (0, 2) + (6, 8) -> (10, 12) */ let conn = &mut get_connection(None); Collection::create(conn, "test collection"); @@ -1180,16 +1180,16 @@ mod tests { #[test] fn test_deletion_block_mapping() { /* - path 1 (one node/sequence): + path 1 (one node/sequence): |ATCGATCG| -> sequence (0, 8) - path 2: Mimics a pure deletion - |AT| -> (0, 2) shared with first two bp of path 1 - |CG| -> (2, 4) shared with last 2 bp of path 1 + path 2: Mimics a pure deletion + |AT| -> (0, 2) shared with first two bp of path 1 + |CG| -> (2, 4) shared with last 2 bp of path 1 Mappings: - (0, 2) -> (0, 2) - (6, 8) -> (2, 4) + (0, 2) -> (0, 2) + (6, 8) -> (2, 4) */ let conn = &mut get_connection(None); Collection::create(conn, "test collection"); @@ -1264,18 +1264,18 @@ mod tests { #[test] fn test_two_block_insertion_mapping() { /* - path 1 (two nodes/sequences): + path 1 (two nodes/sequences): |ATCGATCG| -> sequence (0, 8) |TTTTTTTT| -> sequence (8, 16) - path 2: Mimics a pure insertion in the middle of the two blocks + path 2: Mimics a pure insertion in the middle of the two blocks |ATCGATCG| -> sequence (0, 8) |AAAAAAAA| -> sequence (8, 16) |TTTTTTTT| -> sequence (16, 24) Mappings: - (0, 8) -> (0, 8) - (8, 16) -> (16, 24) + (0, 8) -> (0, 8) + (8, 16) -> (16, 24) */ let conn = &mut get_connection(None); Collection::create(conn, "test collection"); @@ -1390,18 +1390,18 @@ mod tests { #[test] fn test_two_block_replacement_mapping() { /* - path 1 (two nodes/sequences): + path 1 (two nodes/sequences): |ATCGATCG| -> sequence (0, 8) |TTTTTTTT| -> sequence (8, 16) - path 2: Mimics a replacement across the two blocks + path 2: Mimics a replacement across the two blocks |ATCG| -> sequence (0, 4) |AAAAAAAA| -> sequence (4, 12) |TTTT| -> sequence (12, 16) Mappings: - (0, 4) -> (0, 4) - (12, 16) -> (12, 16) + (0, 4) -> (0, 4) + (12, 16) -> (12, 16) */ let conn = &mut get_connection(None); Collection::create(conn, "test collection"); @@ -1513,17 +1513,17 @@ mod tests { #[test] fn test_two_block_deletion_mapping() { /* - path 1 (two nodes/sequences): + path 1 (two nodes/sequences): |ATCGATCG| -> sequence (0, 8) |TTTTTTTT| -> sequence (8, 16) - path 2: Mimics a deletion across the two blocks + path 2: Mimics a deletion across the two blocks |ATCG| -> sequence (0, 4) |TTTT| -> sequence (4, 8) Mappings: - (0, 4) -> (0, 4) - (12, 16) -> (4, 8) + (0, 4) -> (0, 4) + (12, 16) -> (4, 8) */ let conn = &mut get_connection(None); Collection::create(conn, "test collection"); From e0cf1110116484b918afaab15b415a4ccd28a7c1 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 16 Oct 2024 16:15:10 -0400 Subject: [PATCH 3/6] Map annotations across paths, use self more --- src/exports/gfa.rs | 2 +- src/imports/fasta.rs | 4 +- src/imports/gfa.rs | 10 +-- src/models/block_group.rs | 44 +++++------ src/models/path.rs | 159 ++++++++++++++++++++++++++++---------- src/range.rs | 31 ++++++++ 6 files changed, 180 insertions(+), 70 deletions(-) diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 895b334..e792167 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -361,7 +361,7 @@ mod tests { let paths = Path::get_paths_for_collection(&conn, "test collection 2"); assert_eq!(paths.len(), 1); - assert_eq!(Path::sequence(&conn, paths[0].clone()), "AAAATTTTGGGGCCCC"); + assert_eq!(paths[0].sequence(&conn), "AAAATTTTGGGGCCCC"); } #[test] diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index f815cef..f20ecf8 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -151,7 +151,7 @@ mod tests { let path = Path::get(&conn, 1); assert_eq!( - Path::sequence(&conn, path), + path.sequence(&conn), "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string() ); } @@ -180,7 +180,7 @@ mod tests { let path = Path::get(&conn, 1); assert_eq!( - Path::sequence(&conn, path), + path.sequence(&conn), "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string() ); } diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index e0c3f50..22a72a4 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -243,7 +243,7 @@ mod tests { )[0] .clone(); - let result = Path::sequence(conn, path); + let result = path.sequence(conn); assert_eq!(result, "ATCGATCGATCGATCGATCGGGAACACACAGAGA"); let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i64; @@ -288,7 +288,7 @@ mod tests { )[0] .clone(); - let result = Path::sequence(conn, path); + let result = path.sequence(conn); assert_eq!(result, "ACCTACAAATTCAAAC"); let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i64; @@ -314,7 +314,7 @@ mod tests { )[0] .clone(); - let result = Path::sequence(conn, path); + let result = path.sequence(conn); assert_eq!(result, "TATGCCAGCTGCGAATA"); let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i64; @@ -343,7 +343,7 @@ mod tests { )[0] .clone(); - let result = Path::sequence(conn, path); + let result = path.sequence(conn); let big_part = "TGCTAGCTACTAGTGAAAGAGGAGAAATACTAGATGGCTTCCTCCGAAGACGTTATCAAAGAGTTCATGCGTTTCAAAGTTCGTATGGAAGGTTCCGTTAACGGTCACGAGTTCGAAATCGAAGGTGAAGGTGAAGGTCGTCCGTACGAAGGTACCCAGACCGCTAAACTGAAAGTTACCAAAGGTGGTCCGCTGCCGTTCGCTTGGGACATCCTGTCCCCGCAGTTCCAGTACGGTTCCAAAGCTTACGTTAAACACCCGGCTGACATCCCGGACTACCTGAAACTGTCCTTCCCGGAAGGTTTCAAATGGGAACGTGTTATGAACTTCGAAGACGGTGGTGTTGTTACCGTTACCCAGGACTCCTCCCTGCAAGACGGTGAGTTCATCTACAAAGTTAAACTGCGTGGTACCAACTTCCCGTCCGACGGTCCGGTTATGCAGAAAAAAACCATGGGTTGGGAAGCTTCCACCGAACGTATGTACCCGGAAGACGGTGCTCTGAAAGGTGAAATCAAAATGCGTCTGAAACTGAAAGACGGTGGTCACTACGACGCTGAAGTTAAAACCACCTACATGGCTAAAAAACCGGTTCAGCTGCCGGGTGCTTACAAAACCGACATCAAACTGGACATCACCTCCCACAACGAAGACTACACCATCGTTGAACAGTACGAACGTGCTGAAGGTCGTCACTCCACCGGTGCTTAATAACGCTGATAGTGCTAGTGTAGATCGCTACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATATACTAGAAGCGGCCGCTGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCAGAATTTCAGATAAAAAAAATCCTTAGCTTTCGCTAAGGATGATTTCTGGAATTCGCGGCCGCATCTAGAG"; let expected_sequence_parts = vec![ "T", @@ -446,7 +446,7 @@ mod tests { )[0] .clone(); - let result = Path::sequence(conn, path); + let result = path.sequence(conn); assert_eq!(result, "AA"); let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); diff --git a/src/models/block_group.rs b/src/models/block_group.rs index f4cd93d..fdb74c8 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -76,7 +76,7 @@ impl PathCache<'_> { .clone(); path_cache.cache.insert(path_key, new_path.clone()); - let tree = Path::intervaltree_for(path_cache.conn, &new_path); + let tree = new_path.intervaltree(path_cache.conn); path_cache.intervaltree_cache.insert(new_path.clone(), tree); new_path } @@ -721,7 +721,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -760,7 +760,7 @@ mod tests { phased: 0, }; // take out an entire block. - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( @@ -803,7 +803,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -845,7 +845,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -887,7 +887,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -929,7 +929,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -971,7 +971,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1013,7 +1013,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1055,7 +1055,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1097,7 +1097,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1142,7 +1142,7 @@ mod tests { }; // take out an entire block. - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( @@ -1183,7 +1183,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1195,7 +1195,7 @@ mod tests { ]) ); - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1237,7 +1237,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1280,7 +1280,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1322,7 +1322,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1364,7 +1364,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1406,7 +1406,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1448,7 +1448,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1490,7 +1490,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); @@ -1532,7 +1532,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); diff --git a/src/models/path.rs b/src/models/path.rs index 10105bc..8167e91 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -1,3 +1,4 @@ +use core::ops::Range as RustRange; use std::collections::{HashMap, HashSet}; use intervaltree::IntervalTree; @@ -73,6 +74,13 @@ pub struct PathBlock { pub strand: Strand, } +#[derive(Clone, Debug)] +pub struct Annotation { + pub name: String, + pub start: i64, + pub end: i64, +} + impl Path { pub fn create(conn: &Connection, name: &str, block_group_id: i64, edge_ids: &[i64]) -> Path { // TODO: Should we do something if edge_ids don't match here? Suppose we have a path @@ -166,8 +174,8 @@ impl Path { Path::get_paths(conn, query, vec![Value::from(collection_name.to_string())]) } - pub fn sequence(conn: &Connection, path: Path) -> String { - let blocks = Path::blocks_for(conn, &path); + pub fn sequence(&self, conn: &Connection) -> String { + let blocks = self.blocks(conn); blocks .into_iter() .map(|block| block.block_sequence) @@ -176,8 +184,8 @@ impl Path { } pub fn edge_pairs_to_block( + &self, block_id: i64, - path: &Path, into: Edge, out_of: Edge, sequences_by_node_id: &HashMap, @@ -186,7 +194,7 @@ impl Path { if into.target_node_id != out_of.source_node_id { panic!( "Consecutive edges in path {0} don't share the same sequence", - path.id + self.id ); } @@ -203,7 +211,7 @@ impl Path { } else { panic!( "Edge pair with target_strand/source_strand mismatch for path {}", - path.id + self.id ); } @@ -225,8 +233,8 @@ impl Path { } } - pub fn blocks_for(conn: &Connection, path: &Path) -> Vec { - let edges = PathEdge::edges_for_path(conn, path.id); + pub fn blocks(&self, conn: &Connection) -> Vec { + let edges = PathEdge::edges_for_path(conn, self.id); let mut sequence_node_ids = HashSet::new(); for edge in &edges { if edge.source_node_id != PATH_START_NODE_ID { @@ -259,9 +267,8 @@ impl Path { }); for (index, (into, out_of)) in edges.into_iter().tuple_windows().enumerate() { - let block = Path::edge_pairs_to_block( + let block = self.edge_pairs_to_block( index as i64, - path, into, out_of, &sequences_by_node_id, @@ -288,8 +295,8 @@ impl Path { blocks } - pub fn intervaltree_for(conn: &Connection, path: &Path) -> IntervalTree { - let blocks = Path::blocks_for(conn, path); + pub fn intervaltree(&self, conn: &Connection) -> IntervalTree { + let blocks = self.blocks(conn); let tree: IntervalTree = blocks .into_iter() .map(|block| (block.path_start..block.path_end, block)) @@ -297,15 +304,11 @@ impl Path { tree } - pub fn find_block_mappings( - conn: &Connection, - our_path: &Path, - other_path: &Path, - ) -> Vec { + pub fn find_block_mappings(&self, conn: &Connection, other_path: &Path) -> Vec { // Given two paths, find the overlapping parts of common nodes/blocks and return a list af // mappings from subranges of one path to corresponding shared subranges of the other path - let our_blocks = Path::blocks_for(conn, our_path); - let their_blocks = Path::blocks_for(conn, other_path); + let our_blocks = self.blocks(conn); + let their_blocks = other_path.blocks(conn); let our_node_ids = our_blocks .iter() @@ -423,6 +426,85 @@ impl Path { .sorted_by(|a, b| a.source_range.start.cmp(&b.source_range.start)) .collect::>() } + + fn propagate_annotation( + annotation: Annotation, + mapping_tree: &IntervalTree, + sequence_length: i64, + ) -> Option { + // TODO: Add support for different propagation strategies + // TODO: Handle circular contigs + let start = annotation.start; + let end = annotation.end; + let mappings: Vec = mapping_tree + .query(RustRange { start, end }) + .map(|x| x.value.clone()) + .collect(); + if mappings.is_empty() { + return None; + } + + let sorted_mappings: Vec = mappings + .into_iter() + .sorted_by(|a, b| a.source_range.start.cmp(&b.source_range.start)) + .collect(); + let first_mapping = sorted_mappings.first().unwrap(); + let last_mapping = sorted_mappings.last().unwrap(); + let translated_start = if first_mapping.source_range.contains(start) { + first_mapping.source_range.translate_index( + start, + &first_mapping.target_range, + sequence_length, + false, + ) + } else { + first_mapping.target_range.start + }; + + let translated_end = if last_mapping.source_range.contains(end) { + last_mapping.source_range.translate_index( + end, + &last_mapping.target_range, + sequence_length, + false, + ) + } else { + last_mapping.target_range.end + }; + + Some(Annotation { + name: annotation.name, + start: translated_start, + end: translated_end, + }) + } + + pub fn propagate_annotations_to_path( + &self, + conn: &Connection, + path: &Path, + annotations: Vec, + ) -> Vec { + let mappings = self.find_block_mappings(conn, path); + let sequence_length = path.sequence(conn).len(); + let mapping_tree: IntervalTree = mappings + .into_iter() + .map(|mapping| { + ( + mapping.source_range.start..mapping.source_range.end, + mapping, + ) + }) + .collect(); + + annotations + .into_iter() + .filter_map(|annotation| { + Path::propagate_annotation(annotation, &mapping_tree, sequence_length as i64) + }) + .clone() + .collect() + } } #[cfg(test)] @@ -520,7 +602,7 @@ mod tests { block_group.id, &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); - assert_eq!(Path::sequence(conn, path), "ATCGATCGAAAAAAACCCCCCCGGGGGGG"); + assert_eq!(path.sequence(conn), "ATCGATCGAAAAAAACCCCCCCGGGGGGG"); } #[test] @@ -610,7 +692,7 @@ mod tests { block_group.id, &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); - assert_eq!(Path::sequence(conn, path), "CCCCCCCGGGGGGGTTTTTTTCGATCGAT"); + assert_eq!(path.sequence(conn), "CCCCCCCGGGGGGGTTTTTTTCGATCGAT"); } #[test] @@ -707,7 +789,7 @@ mod tests { block_group.id, &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); - let tree = Path::intervaltree_for(conn, &path); + let tree = path.intervaltree(conn); let blocks1: Vec = tree.query_point(2).map(|x| x.value.clone()).collect(); assert_eq!(blocks1.len(), 1); let block1 = &blocks1[0]; @@ -781,7 +863,7 @@ mod tests { let path = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); - let mappings = Path::find_block_mappings(conn, &path, &path); + let mappings = path.find_block_mappings(conn, &path); assert_eq!(mappings.len(), 1); let mapping = &mappings[0]; assert_eq!(mapping.source_range, mapping.target_range); @@ -865,7 +947,7 @@ mod tests { let path2 = Path::create(conn, "chr2", block_group.id, &[edge3.id, edge4.id]); - let mappings = Path::find_block_mappings(conn, &path1, &path2); + let mappings = path1.find_block_mappings(conn, &path2); assert_eq!(mappings.len(), 0); } @@ -963,9 +1045,9 @@ mod tests { &[edge3.id, edge4.id, edge5.id], ); - assert_eq!(Path::sequence(conn, path2.clone()), "ATCGTTTTTTTT"); + assert_eq!(path2.sequence(conn), "ATCGTTTTTTTT"); - let mappings = Path::find_block_mappings(conn, &path1, &path2); + let mappings = path1.find_block_mappings(conn, &path2); assert_eq!(mappings.len(), 1); let mapping = &mappings[0]; assert_eq!(mapping.source_range, mapping.target_range); @@ -1058,9 +1140,9 @@ mod tests { &[edge1.id, edge4.id, edge5.id, edge2.id], ); - assert_eq!(Path::sequence(conn, path2.clone()), "ATCGTTTTTTTTATCG"); + assert_eq!(path2.sequence(conn), "ATCGTTTTTTTTATCG"); - let mappings = Path::find_block_mappings(conn, &path1, &path2); + let mappings = path1.find_block_mappings(conn, &path2); assert_eq!(mappings.len(), 2); let mapping1 = &mappings[0]; assert_eq!(mapping1.source_range, mapping1.target_range); @@ -1159,9 +1241,9 @@ mod tests { &[edge1.id, edge4.id, edge5.id, edge2.id], ); - assert_eq!(Path::sequence(conn, path2.clone()), "ATTTTTTTTTCG"); + assert_eq!(path2.sequence(conn), "ATTTTTTTTTCG"); - let mappings = Path::find_block_mappings(conn, &path1, &path2); + let mappings = path1.find_block_mappings(conn, &path2); assert_eq!(mappings.len(), 2); let mapping1 = &mappings[0]; assert_eq!(mapping1.source_range, mapping1.target_range); @@ -1243,9 +1325,9 @@ mod tests { &[edge1.id, edge4.id, edge2.id], ); - assert_eq!(Path::sequence(conn, path2.clone()), "ATCG"); + assert_eq!(path2.sequence(conn), "ATCG"); - let mappings = Path::find_block_mappings(conn, &path1, &path2); + let mappings = path1.find_block_mappings(conn, &path2); assert_eq!(mappings.len(), 2); let mapping1 = &mappings[0]; assert_eq!(mapping1.source_range, mapping1.target_range); @@ -1366,12 +1448,9 @@ mod tests { &[edge1.id, edge4.id, edge5.id, edge3.id], ); - assert_eq!( - Path::sequence(conn, path2.clone()), - "ATCGATCGAAAAAAAATTTTTTTT" - ); + assert_eq!(path2.sequence(conn), "ATCGATCGAAAAAAAATTTTTTTT"); - let mappings = Path::find_block_mappings(conn, &path1, &path2); + let mappings = path1.find_block_mappings(conn, &path2); assert_eq!(mappings.len(), 2); let mapping1 = &mappings[0]; assert_eq!(mapping1.source_range, mapping1.target_range); @@ -1492,9 +1571,9 @@ mod tests { &[edge1.id, edge4.id, edge5.id, edge3.id], ); - assert_eq!(Path::sequence(conn, path2.clone()), "ATCGAAAAAAAATTTT"); + assert_eq!(path2.sequence(conn), "ATCGAAAAAAAATTTT"); - let mappings = Path::find_block_mappings(conn, &path1, &path2); + let mappings = path1.find_block_mappings(conn, &path2); assert_eq!(mappings.len(), 2); let mapping1 = &mappings[0]; assert_eq!(mapping1.source_range, mapping1.target_range); @@ -1598,9 +1677,9 @@ mod tests { &[edge1.id, edge4.id, edge3.id], ); - assert_eq!(Path::sequence(conn, path2.clone()), "ATCGTTTT"); + assert_eq!(path2.sequence(conn), "ATCGTTTT"); - let mappings = Path::find_block_mappings(conn, &path1, &path2); + let mappings = path1.find_block_mappings(conn, &path2); assert_eq!(mappings.len(), 2); let mapping1 = &mappings[0]; assert_eq!(mapping1.source_range, mapping1.target_range); diff --git a/src/range.rs b/src/range.rs index 72c08d4..b94d53f 100644 --- a/src/range.rs +++ b/src/range.rs @@ -157,6 +157,37 @@ impl Range { sorted_overlaps } + + pub fn contains(&self, index: i64) -> bool { + if self.is_wraparound() { + index >= self.start || index <= self.end + } else { + index >= self.start && index <= self.end + } + } + + pub fn circular_mod(index: i64, sequence_length: i64, is_circular_contig: bool) -> i64 { + if is_circular_contig { + index % sequence_length + } else { + index + } + } + + pub fn translate_index( + &self, + index: i64, + range2: &Range, + sequence_length: i64, + is_circular_contig: bool, + ) -> i64 { + if !self.contains(index) { + panic!("Index {} is not contained in range {:?}", index, self); + } + + let offset = index - self.start; + Range::circular_mod(range2.start + offset, sequence_length, is_circular_contig) + } } #[derive(Clone, Debug, Eq, Hash, PartialEq)] From 9b9c9115760546d1c054cf1a47cca6ba1af7666c Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 16 Oct 2024 17:38:54 -0400 Subject: [PATCH 4/6] Annotation tests --- src/models/path.rs | 682 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 681 insertions(+), 1 deletion(-) diff --git a/src/models/path.rs b/src/models/path.rs index 8167e91..d9bb0e8 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -479,7 +479,7 @@ impl Path { }) } - pub fn propagate_annotations_to_path( + pub fn propagate_annotations( &self, conn: &Connection, path: &Path, @@ -1694,4 +1694,684 @@ mod tests { assert_eq!(mapping2.target_range.start, 4); assert_eq!(mapping2.target_range.end, 8); } + + #[test] + fn test_annotation_propagation_full_overlap() { + /* + |--------| path: 1 sequence, (0, 8) + |ATCGATCG| + |--------| Same path: 1 sequence, (0, 8) + + Mapping: (0, 8) -> (0, 8) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let annotation = Annotation { + name: "foo".to_string(), + start: 0, + end: 8, + }; + let annotations = path.propagate_annotations(conn, &path, vec![annotation]); + assert_eq!(annotations.len(), 1); + let result_annotation = &annotations[0]; + assert_eq!(result_annotation.name, "foo"); + assert_eq!(result_annotation.start, 0); + assert_eq!(result_annotation.end, 8); + } + + #[test] + fn test_propagate_annotations_no_overlap() { + /* + |--------| -> path 1 (one node) + |ATCGATCG| -> sequence + + |--------| -> path 2 (one node, totally different sequence) + |TTTTTTTT| -> other sequence + + Mappings: empty + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge3 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge4 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create(conn, "chr2", block_group.id, &[edge3.id, edge4.id]); + + let annotation = Annotation { + name: "foo".to_string(), + start: 0, + end: 8, + }; + let annotations = path1.propagate_annotations(conn, &path2, vec![annotation]); + assert_eq!(annotations.len(), 0); + } + + #[test] + fn test_propagate_annotations_partial_overlap() { + /* + path 1 (one node/sequence): + |--------| + |ATCGATCG| -> sequence (0, 8) + + path 2: + |----| -> (0, 4) + |--------| -> (4, 12) + |ATCG| -> shared with path 1 + |TTTTTTTT| -> unrelated sequence + + Mapping: (0, 4) -> (0, 4) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge3 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge4 = Edge::create( + conn, + node1_id, + 4, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge5 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge3.id, edge4.id, edge5.id], + ); + + assert_eq!(path2.sequence(conn), "ATCGTTTTTTTT"); + + let annotation = Annotation { + name: "foo".to_string(), + start: 0, + end: 8, + }; + let annotations = path1.propagate_annotations(conn, &path2, vec![annotation]); + assert_eq!(annotations.len(), 1); + let result_annotation = &annotations[0]; + assert_eq!(result_annotation.name, "foo"); + assert_eq!(result_annotation.start, 0); + assert_eq!(result_annotation.end, 4); + } + + #[test] + fn test_propagate_annotations_with_insertion() { + /* + path 1 (one node/sequence): + |ATCGATCG| -> sequence (0, 8) + + path 2: Mimics a pure insertion + |ATCG| -> (0, 4) shared with first half of path 1 + |TTTTTTTT| -> (4, 12) unrelated sequence + |ATCG| -> (12, 16) shared with second half of path 1 + + Mappings: + (0, 4) -> (0, 4) + (4, 8) -> (12, 16) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge4 = Edge::create( + conn, + node1_id, + 4, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge5 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + node1_id, + 4, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge1.id, edge4.id, edge5.id, edge2.id], + ); + + assert_eq!(path2.sequence(conn), "ATCGTTTTTTTTATCG"); + + let annotation = Annotation { + name: "foo".to_string(), + start: 0, + end: 8, + }; + + let annotations = path1.propagate_annotations(conn, &path2, vec![annotation]); + assert_eq!(annotations.len(), 1); + + // Under the default propagation strategy, the annotation is expanded to cover anything in + // between parts it covers + let result_annotation = &annotations[0]; + assert_eq!(result_annotation.name, "foo"); + assert_eq!(result_annotation.start, 0); + assert_eq!(result_annotation.end, 16); + } + + #[test] + fn test_propagate_annotations_with_replacement() { + /* + path 1 (one node/sequence): + |ATCGATCG| -> sequence (0, 8) + + path 2: Mimics a replacement + |AT| -> (0, 2) shared with first two bp of path 1 + |TTTTTTTT| -> (2, 10) unrelated sequence + |CG| -> (10, 12) shared with last 2 bp of path 1 + + Mappings: + (0, 2) -> (0, 2) + (6, 8) -> (10, 12) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create(conn, "chr1", block_group.id, &[edge1.id, edge2.id]); + + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge4 = Edge::create( + conn, + node1_id, + 2, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge5 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + node1_id, + 6, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge1.id, edge4.id, edge5.id, edge2.id], + ); + + assert_eq!(path2.sequence(conn), "ATTTTTTTTTCG"); + + let annotation = Annotation { + name: "foo".to_string(), + start: 0, + end: 4, + }; + + let annotations = path1.propagate_annotations(conn, &path2, vec![annotation]); + assert_eq!(annotations.len(), 1); + + // Under the default propagation strategy, the annotation is truncated + let result_annotation = &annotations[0]; + assert_eq!(result_annotation.name, "foo"); + assert_eq!(result_annotation.start, 0); + assert_eq!(result_annotation.end, 2); + } + + #[test] + fn test_propagate_annotations_with_insertion_across_two_blocks() { + /* + path 1 (two nodes/sequences): + |ATCGATCG| -> sequence (0, 8) + |TTTTTTTT| -> sequence (8, 16) + + path 2: Mimics a pure insertion in the middle of the two blocks + |ATCGATCG| -> sequence (0, 8) + |AAAAAAAA| -> sequence (8, 16) + |TTTTTTTT| -> sequence (16, 24) + + Mappings: + (0, 8) -> (0, 8) + (8, 16) -> (16, 24) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge3 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create( + conn, + "chr1", + block_group.id, + &[edge1.id, edge2.id, edge3.id], + ); + + let sequence3 = Sequence::new() + .sequence_type("DNA") + .sequence("AAAAAAAA") + .save(conn); + let node3_id = Node::create(conn, sequence3.hash.as_str(), None); + let edge4 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + node3_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge5 = Edge::create( + conn, + node3_id, + 8, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge1.id, edge4.id, edge5.id, edge3.id], + ); + + assert_eq!(path2.sequence(conn), "ATCGATCGAAAAAAAATTTTTTTT"); + + let annotation = Annotation { + name: "foo".to_string(), + start: 0, + end: 16, + }; + + let annotations = path1.propagate_annotations(conn, &path2, vec![annotation]); + assert_eq!(annotations.len(), 1); + + // Under the default propagation strategy, the annotation is extended across the inserted + // region + let result_annotation = &annotations[0]; + assert_eq!(result_annotation.name, "foo"); + assert_eq!(result_annotation.start, 0); + assert_eq!(result_annotation.end, 24); + } + + #[test] + fn test_propagate_annotations_with_deletion_across_two_blocks() { + /* + path 1 (two nodes/sequences): + |ATCGATCG| -> sequence (0, 8) + |TTTTTTTT| -> sequence (8, 16) + + path 2: Mimics a deletion across the two blocks + |ATCG| -> sequence (0, 4) + |TTTT| -> sequence (4, 8) + + Mappings: + (0, 4) -> (0, 4) + (12, 16) -> (4, 8) + */ + let conn = &mut get_connection(None); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCG") + .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str(), None); + let sequence2 = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTT") + .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str(), None); + let edge1 = Edge::create( + conn, + PATH_START_NODE_ID, + -1, + Strand::Forward, + node1_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + node1_id, + 8, + Strand::Forward, + node2_id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge3 = Edge::create( + conn, + node2_id, + 8, + Strand::Forward, + PATH_END_NODE_ID, + -1, + Strand::Forward, + 0, + 0, + ); + + let path1 = Path::create( + conn, + "chr1", + block_group.id, + &[edge1.id, edge2.id, edge3.id], + ); + + let edge4 = Edge::create( + conn, + node1_id, + 4, + Strand::Forward, + node2_id, + 4, + Strand::Forward, + 0, + 0, + ); + + let path2 = Path::create( + conn, + "chr2", + block_group.id, + &[edge1.id, edge4.id, edge3.id], + ); + + assert_eq!(path2.sequence(conn), "ATCGTTTT"); + + let annotation = Annotation { + name: "foo".to_string(), + start: 0, + end: 12, + }; + + let annotations = path1.propagate_annotations(conn, &path2, vec![annotation]); + assert_eq!(annotations.len(), 1); + + // Under the default propagation strategy, the annotation is truncated + let result_annotation = &annotations[0]; + assert_eq!(result_annotation.name, "foo"); + assert_eq!(result_annotation.start, 0); + assert_eq!(result_annotation.end, 4); + } } From 9e1cb1b50a2e6187e4cdcbe5d1c0ad9cb4251b8b Mon Sep 17 00:00:00 2001 From: hofer Date: Thu, 17 Oct 2024 17:25:35 -0400 Subject: [PATCH 5/6] Some comments and new test --- src/models/path.rs | 15 +++++++++++++++ src/range.rs | 19 ++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/models/path.rs b/src/models/path.rs index d9bb0e8..1bb7dee 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -432,6 +432,21 @@ impl Path { mapping_tree: &IntervalTree, sequence_length: i64, ) -> Option { + /* + This method contains the core logic for propagating an annotation from one path to another. + The core rules are: + + 1. If the annotation can be fully propagated to a matching subregion of the other path, + we propagate it + + 2. If only part of the annotation can be propagated to a partial subregion of the other + path, we propagate just that part and truncate the rest + + 3. If the first and last parts of the annotation can be propagated to subregions of the + other path (but not one or more parts of the middle of the annotation), we propagate the + entire annotation, including across the parts that don't match those of this path + */ + // TODO: Add support for different propagation strategies // TODO: Handle circular contigs let start = annotation.start; diff --git a/src/range.rs b/src/range.rs index b94d53f..f9ff51f 100644 --- a/src/range.rs +++ b/src/range.rs @@ -15,6 +15,7 @@ impl Range { } } + // Returns true if this range is left-adjacent to the other range pub fn left_adjoins(&self, other: &Range, modulus: Option) -> bool { let mut other_start = other.start; let mut self_end = self.end; @@ -139,6 +140,7 @@ impl Range { overlaps } + // Consolidate the first and last overlaps if they lie on either side of the origin. fn consolidate_overlaps_about_the_origin(overlaps: Vec) -> Vec { let mut sorted_overlaps = overlaps .clone() @@ -197,7 +199,7 @@ pub struct RangeMapping { } impl RangeMapping { - pub fn merge_continuous_mappings(mappings: Vec) -> Vec { + pub fn merge_contiguous_mappings(mappings: Vec) -> Vec { let mut grouped_mappings = vec![]; let mut current_group = vec![]; @@ -266,7 +268,18 @@ mod tests { } #[test] - fn test_merge_continuous_ranges() { + fn test_overlap() { + let range1 = Range { start: 0, end: 12 }; + let range2 = Range { start: 4, end: 8 }; + let range3 = Range { start: 10, end: 16 }; + + assert_eq!(range1.overlap(&range2), vec![Range { start: 4, end: 8 }]); + assert_eq!(range1.overlap(&range3), vec![Range { start: 10, end: 12 }]); + assert_eq!(range2.overlap(&range3), vec![]); + } + + #[test] + fn test_merge_contiguous_ranges() { let mappings = vec![ RangeMapping { source_range: Range { start: 0, end: 2 }, @@ -282,7 +295,7 @@ mod tests { }, ]; - let merged_mappings = RangeMapping::merge_continuous_mappings(mappings); + let merged_mappings = RangeMapping::merge_contiguous_mappings(mappings); assert_eq!(merged_mappings.len(), 2); assert_eq!( merged_mappings, From 063364f572b03a2b0b655b81d1984108cba6c5a1 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 30 Oct 2024 12:50:00 -0400 Subject: [PATCH 6/6] Fixes after rebase; address comment --- src/exports/gfa.rs | 2 +- src/models/edge.rs | 2 +- src/models/path.rs | 12 ++++++++---- src/range.rs | 10 +++++++--- src/updates/library.rs | 2 +- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index e792167..9dd8f54 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -477,7 +477,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let edges = BlockGroupEdge::edges_for_block_group(&conn, block_group_id); diff --git a/src/models/edge.rs b/src/models/edge.rs index 20152ce..45ad238 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -815,7 +815,7 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); + let tree = path.intervaltree(&conn); BlockGroup::insert_change(&conn, &change, &tree); let mut edges = BlockGroupEdge::edges_for_block_group(&conn, block_group_id); diff --git a/src/models/path.rs b/src/models/path.rs index 1bb7dee..ccb4b38 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -473,7 +473,7 @@ impl Path { false, ) } else { - first_mapping.target_range.start + Ok(first_mapping.target_range.start) }; let translated_end = if last_mapping.source_range.contains(end) { @@ -484,13 +484,17 @@ impl Path { false, ) } else { - last_mapping.target_range.end + Ok(last_mapping.target_range.end) }; + if translated_start.is_err() || translated_end.is_err() { + return None; + } + Some(Annotation { name: annotation.name, - start: translated_start, - end: translated_end, + start: translated_start.expect("Failed to translate start"), + end: translated_end.expect("Failed to translate end"), }) } diff --git a/src/range.rs b/src/range.rs index f9ff51f..b1368de 100644 --- a/src/range.rs +++ b/src/range.rs @@ -182,13 +182,17 @@ impl Range { range2: &Range, sequence_length: i64, is_circular_contig: bool, - ) -> i64 { + ) -> Result { if !self.contains(index) { - panic!("Index {} is not contained in range {:?}", index, self); + return Err("Index is not contained in range"); } let offset = index - self.start; - Range::circular_mod(range2.start + offset, sequence_length, is_circular_contig) + Ok(Range::circular_mod( + range2.start + offset, + sequence_length, + is_circular_contig, + )) } } diff --git a/src/updates/library.rs b/src/updates/library.rs index 3e0e114..1aff439 100644 --- a/src/updates/library.rs +++ b/src/updates/library.rs @@ -106,7 +106,7 @@ pub fn update_with_library( parts_list.push(parts_by_index.get(&index).unwrap()); } - let path_intervaltree = Path::intervaltree_for(conn, &path); + let path_intervaltree = path.intervaltree(conn); let start_blocks: Vec<_> = path_intervaltree .query_point(start_coordinate) .map(|x| &x.value)