From 447ceeffb500d8294864134707450fcd1046e4d4 Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 20 Aug 2024 12:56:49 -0400 Subject: [PATCH] Add method for returning intervaltree mapping path ranges to blocks --- Cargo.lock | 10 +++++ Cargo.toml | 1 + src/models/path.rs | 108 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 118 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index be95ac1..12cb1b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -591,6 +591,7 @@ dependencies = [ "bio", "clap", "include_dir", + "intervaltree", "itertools", "noodles", "petgraph", @@ -679,6 +680,15 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "intervaltree" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "270bc34e57047cab801a8c871c124d9dc7132f6473c6401f645524f4e6edd111" +dependencies = [ + "smallvec", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" diff --git a/Cargo.toml b/Cargo.toml index 8230f76..1d08dce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" bio = "2.0.0" clap = { version = "4.5.8", features = ["derive"] } include_dir = "0.7.4" +intervaltree = "0.2.7" itertools = "0.13.0" rusqlite = { version = "0.31.0", features = ["bundled", "array"] } rusqlite_migration = { version = "1.2.0" , features = ["from-directory"]} diff --git a/src/models/path.rs b/src/models/path.rs index 2f47a31..dfcab60 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -1,4 +1,5 @@ use crate::models::{block::Block, new_edge::NewEdge, path_edge::PathEdge, sequence::Sequence}; +use intervaltree::IntervalTree; use itertools::Itertools; use petgraph::graphmap::DiGraphMap; use petgraph::prelude::Dfs; @@ -202,13 +203,16 @@ impl Path { let strand; let block_sequence; + let block_sequence_length; if end >= start { strand = "+"; block_sequence = sequence.sequence[start as usize..end as usize].to_string(); + block_sequence_length = end - start; } else { strand = "-"; block_sequence = revcomp(&sequence.sequence[end as usize..start as usize + 1]); + block_sequence_length = start - end; } NewBlock { @@ -218,7 +222,7 @@ impl Path { sequence_start: start, sequence_end: end, path_start: current_path_length, - path_end: current_path_length + end, + path_end: current_path_length + block_sequence_length, strand: strand.to_string(), } } @@ -258,6 +262,15 @@ impl Path { } blocks } + + pub fn intervaltree_for(conn: &Connection, path: Path) -> IntervalTree { + let blocks = Path::blocks_for(conn, path); + let tree: IntervalTree = blocks + .into_iter() + .map(|block| (block.path_start..block.path_end, block)) + .collect(); + tree + } } #[derive(Debug)] @@ -477,4 +490,97 @@ mod tests { assert_eq!(revcomp("CNNNNA"), "TNNNNG"); assert_eq!(revcomp("cNNgnAt"), "aTncNNg"); } + + #[test] + fn test_intervaltree() { + let conn = &mut get_connection(); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true); + let edge1 = NewEdge::create( + conn, + None, + None, + Some(sequence1_hash.clone()), + Some(0), + 0, + 0, + ); + let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true); + let edge2 = NewEdge::create( + conn, + Some(sequence1_hash.clone()), + Some(8), + Some(sequence2_hash.clone()), + Some(1), + 0, + 0, + ); + let sequence3_hash = Sequence::create(conn, "DNA", "CCCCCCCC", true); + let edge3 = NewEdge::create( + conn, + Some(sequence2_hash.clone()), + Some(8), + Some(sequence3_hash.clone()), + Some(1), + 0, + 0, + ); + let sequence4_hash = Sequence::create(conn, "DNA", "GGGGGGGG", true); + let edge4 = NewEdge::create( + conn, + Some(sequence3_hash.clone()), + Some(8), + Some(sequence4_hash.clone()), + Some(1), + 0, + 0, + ); + let edge5 = NewEdge::create( + conn, + Some(sequence4_hash.clone()), + Some(8), + None, + None, + 0, + 0, + ); + + let path = Path::new_create( + conn, + "chr1", + block_group.id, + vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], + ); + let tree = Path::intervaltree_for(conn, path); + let blocks1: Vec<_> = tree.query_point(2).map(|x| x.value.clone()).collect(); + assert_eq!(blocks1.len(), 1); + let block1 = &blocks1[0]; + assert_eq!(block1.sequence.hash, sequence1_hash); + assert_eq!(block1.sequence_start, 0); + assert_eq!(block1.sequence_end, 8); + assert_eq!(block1.path_start, 0); + assert_eq!(block1.path_end, 8); + assert_eq!(block1.strand, "+"); + + let blocks2: Vec<_> = tree.query_point(12).map(|x| x.value.clone()).collect(); + assert_eq!(blocks2.len(), 1); + let block2 = &blocks2[0]; + assert_eq!(block2.sequence.hash, sequence2_hash); + assert_eq!(block2.sequence_start, 1); + assert_eq!(block2.sequence_end, 8); + assert_eq!(block2.path_start, 8); + assert_eq!(block2.path_end, 15); + assert_eq!(block2.strand, "+"); + + let blocks4: Vec<_> = tree.query_point(25).map(|x| x.value.clone()).collect(); + assert_eq!(blocks4.len(), 1); + let block4 = &blocks4[0]; + assert_eq!(block4.sequence.hash, sequence4_hash); + assert_eq!(block4.sequence_start, 1); + assert_eq!(block4.sequence_end, 8); + assert_eq!(block4.path_start, 22); + assert_eq!(block4.path_end, 29); + assert_eq!(block4.strand, "+"); + } }