From b1a09d94ce13c2cb92a0a4118e436261ade05a45 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 19 Aug 2024 11:25:05 -0400 Subject: [PATCH 01/18] Add new edge and path models --- migrations/01-initial/up.sql | 27 +++++++++- src/models.rs | 2 + src/models/new_edge.rs | 88 ++++++++++++++++++++++++++++++ src/models/path.rs | 19 ++++++- src/models/path_edge.rs | 102 +++++++++++++++++++++++++++++++++++ 5 files changed, 235 insertions(+), 3 deletions(-) create mode 100644 src/models/new_edge.rs create mode 100644 src/models/path_edge.rs diff --git a/migrations/01-initial/up.sql b/migrations/01-initial/up.sql index 36d3c7d..3227709 100644 --- a/migrations/01-initial/up.sql +++ b/migrations/01-initial/up.sql @@ -79,4 +79,29 @@ CREATE TABLE change_log ( FOREIGN KEY(path_id) REFERENCES path(id), FOREIGN KEY(sequence_hash) REFERENCES sequence(hash) ); -CREATE UNIQUE INDEX change_log_uidx ON change_log(hash); \ No newline at end of file +CREATE UNIQUE INDEX change_log_uidx ON change_log(hash); + +CREATE TABLE new_edges ( + id INTEGER PRIMARY KEY NOT NULL, + source_hash TEXT, + source_coordinate INTEGER, + target_hash TEXT, + target_coordinate INTEGER, + chromosome_index INTEGER NOT NULL, + phased INTEGER NOT NULL, + FOREIGN KEY(source_hash) REFERENCES sequence(hash), + FOREIGN KEY(target_hash) REFERENCES sequence(hash), + constraint chk_phased check (phased in (0, 1)) +); +CREATE UNIQUE INDEX new_edge_uidx ON new_edges(source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased); + +CREATE TABLE path_edges ( + id INTEGER PRIMARY KEY NOT NULL, + path_id INTEGER NOT NULL, + source_edge_id INTEGER, + target_edge_id INTEGER, + FOREIGN KEY(source_edge_id) REFERENCES new_edges(id), + FOREIGN KEY(target_edge_id) REFERENCES new_edges(id), + FOREIGN KEY(path_id) REFERENCES path(id) +); +CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, source_edge_id, target_edge_id); diff --git a/src/models.rs b/src/models.rs index 2f20026..86d6294 100644 --- a/src/models.rs +++ b/src/models.rs @@ -8,7 +8,9 @@ use std::fmt::*; pub mod block; pub mod edge; +pub mod new_edge; pub mod path; +pub mod path_edge; pub mod sequence; use crate::graph::all_simple_paths; diff --git a/src/models/new_edge.rs b/src/models/new_edge.rs new file mode 100644 index 0000000..b31c16f --- /dev/null +++ b/src/models/new_edge.rs @@ -0,0 +1,88 @@ +use rusqlite::types::Value; +use rusqlite::{params_from_iter, Connection}; + +#[derive(Debug)] +pub struct NewEdge { + pub id: i32, + pub source_hash: Option, + pub source_coordinate: Option, + pub target_hash: Option, + pub target_coordinate: Option, + pub chromosome_index: i32, + pub phased: i32, +} + +impl NewEdge { + pub fn create( + conn: &Connection, + source_hash: Option, + source_coordinate: Option, + target_hash: Option, + target_coordinate: Option, + chromosome_index: i32, + phased: i32, + ) -> NewEdge { + let query; + let id_query; + let mut placeholders: Vec = vec![]; + if target_hash.is_some() && source_hash.is_some() { + query = "INSERT INTO new_edges (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6) RETURNING *"; + id_query = "select id from new_edges where source_hash = ?1 and source_coordinate = ?2 and target_hash = ?3 and target_coordinate = ?4 and chromosome_index = ?5 and phased = ?6"; + placeholders.push(source_hash.clone().unwrap().into()); + placeholders.push(source_coordinate.unwrap().into()); + placeholders.push(target_hash.clone().unwrap().into()); + placeholders.push(target_coordinate.unwrap().into()); + placeholders.push(chromosome_index.into()); + placeholders.push(phased.into()); + } else if target_hash.is_some() { + id_query = "select id from new_edges where target_hash = ?1 and target_coordinate = ?2 and source_hash is null and chromosome_index = ?3 and phased = ?4"; + query = "INSERT INTO new_edges (target_hash, target_coordinate, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4) RETURNING *"; + placeholders.push(target_hash.clone().unwrap().into()); + placeholders.push(target_coordinate.unwrap().into()); + placeholders.push(chromosome_index.into()); + placeholders.push(phased.into()); + } else { + id_query = "select id from new_edges where source_hash = ?1 and source_coordinate = ?2 and target_id is null and chromosome_index = ?3 and phased = ?4"; + query = "INSERT INTO new_edges (source_hash, source_coordinate, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4) RETURNING *"; + placeholders.push(source_hash.clone().unwrap().into()); + placeholders.push(source_coordinate.unwrap().into()); + placeholders.push(chromosome_index.into()); + placeholders.push(phased.into()); + } + let mut stmt = conn.prepare(query).unwrap(); + match stmt.query_row(params_from_iter(&placeholders), |row| { + Ok(NewEdge { + id: row.get(0)?, + source_hash: row.get(1)?, + source_coordinate: row.get(2)?, + target_hash: row.get(3)?, + target_coordinate: row.get(4)?, + chromosome_index: row.get(5)?, + phased: row.get(6)?, + }) + }) { + Ok(edge) => edge, + Err(rusqlite::Error::SqliteFailure(err, details)) => { + if err.code == rusqlite::ErrorCode::ConstraintViolation { + println!("{err:?} {details:?}"); + NewEdge { + id: conn + .query_row(id_query, params_from_iter(&placeholders), |row| row.get(0)) + .unwrap(), + source_hash, + source_coordinate, + target_hash, + target_coordinate, + chromosome_index, + phased, + } + } else { + panic!("something bad happened querying the database") + } + } + Err(_) => { + panic!("something bad happened querying the database") + } + } + } +} diff --git a/src/models/path.rs b/src/models/path.rs index d561089..f2339d9 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -1,5 +1,4 @@ -use crate::models::block::Block; -use crate::models::edge::Edge; +use crate::models::{block::Block, edge::Edge, path_edge::PathEdge}; use petgraph::graphmap::DiGraphMap; use petgraph::prelude::Dfs; use petgraph::Direction; @@ -47,6 +46,16 @@ pub fn revcomp(seq: &str) -> String { .unwrap() } +#[derive(Clone, Debug)] +pub struct NewBlock { + pub id: i32, + pub sequence_hash: String, + pub block_sequence: String, + pub start: i32, + pub end: i32, + pub strand: String, +} + impl Path { pub fn create(conn: &Connection, name: &str, block_group_id: i32, blocks: Vec) -> Path { let query = "INSERT INTO path (name, block_group_id) VALUES (?1, ?2) RETURNING (id)"; @@ -124,6 +133,12 @@ impl Path { } sequence } + + pub fn get_new_blocks(conn: &Connection, path_id: i32) -> Vec { + let mut new_blocks = vec![]; + let edges = PathEdge::edges_for(conn, path_id); + new_blocks + } } #[derive(Debug)] diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs new file mode 100644 index 0000000..06da710 --- /dev/null +++ b/src/models/path_edge.rs @@ -0,0 +1,102 @@ +use crate::models::new_edge::NewEdge; +use rusqlite::types::Value; +use rusqlite::{params_from_iter, Connection}; + +#[derive(Debug)] +pub struct PathEdge { + pub id: i32, + pub path_id: i32, + pub source_edge_id: Option, + pub target_edge_id: Option, +} + +impl PathEdge { + pub fn create( + conn: &Connection, + path_id: i32, + source_edge_id: Option, + target_edge_id: Option, + ) -> PathEdge { + let query = + "INSERT INTO path_edges (path_id, source_edge_id, target_edge_id) VALUES (?1, ?2, ?3) RETURNING (id)"; + let mut stmt = conn.prepare(query).unwrap(); + let mut rows = stmt + .query_map((path_id, source_edge_id, target_edge_id), |row| { + Ok(PathEdge { + id: row.get(0)?, + path_id, + source_edge_id, + target_edge_id, + }) + }) + .unwrap(); + match rows.next().unwrap() { + Ok(res) => res, + Err(rusqlite::Error::SqliteFailure(err, details)) => { + if err.code == rusqlite::ErrorCode::ConstraintViolation { + println!("{err:?} {details:?}"); + let query; + let mut placeholders = vec![path_id]; + if let Some(s) = source_edge_id { + if let Some(t) = target_edge_id { + query = "SELECT id from path_edges where path_id = ?1 AND source_edge_id = ?2 AND target_edge_id = ?3;"; + placeholders.push(s); + placeholders.push(t); + } else { + query = "SELECT id from path_edges where path_id = ?1 AND source_edge_id = ?2 AND target_edge_id is null;"; + placeholders.push(s); + } + } else if let Some(t) = target_edge_id { + query = "SELECT id from path_edges where path_id = ?1 AND source_edge_id is null AND target_edge_id = ?2;"; + placeholders.push(t); + } else { + panic!("No edge ids passed"); + } + println!("{query} {placeholders:?}"); + PathEdge { + id: conn + .query_row(query, params_from_iter(&placeholders), |row| row.get(0)) + .unwrap(), + path_id, + source_edge_id, + target_edge_id, + } + } else { + panic!("something bad happened querying the database") + } + } + Err(_) => { + panic!("something bad happened querying the database") + } + } + } + + pub fn query(conn: &Connection, query: &str, placeholders: Vec) -> Vec { + let mut stmt = conn.prepare(query).unwrap(); + let rows = stmt + .query_map(params_from_iter(placeholders), |row| { + Ok(PathEdge { + id: row.get(0)?, + path_id: row.get(1)?, + source_edge_id: row.get(2)?, + target_edge_id: row.get(3)?, + }) + }) + .unwrap(); + let mut objs = vec![]; + for row in rows { + objs.push(row.unwrap()); + } + objs + } + + pub fn edges_for(conn: &Connection, path_id: i32) -> Vec { + let edges = vec![]; + let path_edges = PathEdge::query( + conn, + "select * from path_edges where path_id = ?1", + vec![Value::from(path_id)], + ); + edges + } +} From e320d7791f64c0550e3a84e84802fd24ee21f792 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 19 Aug 2024 19:10:19 -0400 Subject: [PATCH 02/18] New methods to generate sequence from path of new edges --- Cargo.lock | 1 + Cargo.toml | 1 + migrations/01-initial/up.sql | 9 +- src/main.rs | 2 +- src/models.rs | 28 ++--- src/models/new_edge.rs | 30 ++++- src/models/path.rs | 144 +++++++++++++++++++++-- src/models/path_edge.rs | 215 +++++++++++++++++++++++++++++------ src/models/sequence.rs | 20 +++- 9 files changed, 375 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b631a48..be95ac1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -591,6 +591,7 @@ dependencies = [ "bio", "clap", "include_dir", + "itertools", "noodles", "petgraph", "rusqlite", diff --git a/Cargo.toml b/Cargo.toml index 31ce0d3..8230f76 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" bio = "2.0.0" clap = { version = "4.5.8", features = ["derive"] } include_dir = "0.7.4" +itertools = "0.13.0" rusqlite = { version = "0.31.0", features = ["bundled", "array"] } rusqlite_migration = { version = "1.2.0" , features = ["from-directory"]} sha2 = "0.10.8" diff --git a/migrations/01-initial/up.sql b/migrations/01-initial/up.sql index 3227709..7c39843 100644 --- a/migrations/01-initial/up.sql +++ b/migrations/01-initial/up.sql @@ -98,10 +98,9 @@ CREATE UNIQUE INDEX new_edge_uidx ON new_edges(source_hash, source_coordinate, t CREATE TABLE path_edges ( id INTEGER PRIMARY KEY NOT NULL, path_id INTEGER NOT NULL, - source_edge_id INTEGER, - target_edge_id INTEGER, - FOREIGN KEY(source_edge_id) REFERENCES new_edges(id), - FOREIGN KEY(target_edge_id) REFERENCES new_edges(id), + index_in_path INTEGER NOT NULL, + edge_id INTEGER NOT NULL, + FOREIGN KEY(edge_id) REFERENCES new_edges(id), FOREIGN KEY(path_id) REFERENCES path(id) ); -CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, source_edge_id, target_edge_id); +CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id); diff --git a/src/main.rs b/src/main.rs index bea2e9d..5c76689 100644 --- a/src/main.rs +++ b/src/main.rs @@ -302,7 +302,7 @@ mod tests { HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); assert_eq!( - Path::sequence(&conn, 1), + Path::get_sequence(&conn, 1), "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string() ); } diff --git a/src/models.rs b/src/models.rs index 86d6294..1d4abb4 100644 --- a/src/models.rs +++ b/src/models.rs @@ -277,16 +277,8 @@ impl BlockGroup { let sequence_hashes = block_map .values() .map(|block| format!("\"{id}\"", id = block.sequence_hash)) - .collect::>() - .join(","); - let mut sequence_map = HashMap::new(); - for sequence in Sequence::get_sequences( - conn, - &format!("select * from sequence where hash in ({sequence_hashes})"), - vec![], - ) { - sequence_map.insert(sequence.hash, sequence.sequence); - } + .collect::>(); + let sequence_map = Sequence::get_sequences_by_hash(conn, sequence_hashes); let block_ids = block_map .keys() .map(|id| format!("{id}")) @@ -323,7 +315,8 @@ impl BlockGroup { let block = block_map.get(&start_node).unwrap(); let block_sequence = sequence_map.get(&block.sequence_hash).unwrap(); sequences.insert( - block_sequence[(block.start as usize)..(block.end as usize)].to_string(), + block_sequence.sequence[(block.start as usize)..(block.end as usize)] + .to_string(), ); } else { for path in all_simple_paths(&graph, start_node, *end_node) { @@ -332,7 +325,8 @@ impl BlockGroup { let block = block_map.get(&node).unwrap(); let block_sequence = sequence_map.get(&block.sequence_hash).unwrap(); current_sequence.push_str( - &block_sequence[(block.start as usize)..(block.end as usize)], + &block_sequence.sequence + [(block.start as usize)..(block.end as usize)], ); } sequences.insert(current_sequence); @@ -428,20 +422,19 @@ impl BlockGroup { // |----range---| let start_split_point = block.start + start - path_start; let end_split_point = block.start + end - path_start; - let mut next_block; - if start_split_point == block.start { + let next_block = if start_split_point == block.start { if let Some(pb) = previous_block { new_edges.push((Some(pb.id), Some(new_block_id))); } - next_block = block.clone(); + block.clone() } else { let (left_block, right_block) = Block::split(conn, block, start_split_point, chromosome_index, phased) .unwrap(); Block::delete(conn, block.id); new_edges.push((Some(left_block.id), Some(new_block_id))); - next_block = right_block.clone(); - } + right_block.clone() + }; if end_split_point == next_block.start { new_edges.push((Some(new_block_id), Some(next_block.id))); @@ -585,7 +578,6 @@ impl ChangeLog { mod tests { use super::*; use crate::migrations::run_migrations; - use std::hash::Hash; fn get_connection() -> Connection { let mut conn = Connection::open_in_memory() diff --git a/src/models/new_edge.rs b/src/models/new_edge.rs index b31c16f..7bd833c 100644 --- a/src/models/new_edge.rs +++ b/src/models/new_edge.rs @@ -1,7 +1,7 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct NewEdge { pub id: i32, pub source_hash: Option, @@ -85,4 +85,32 @@ impl NewEdge { } } } + + pub fn load(conn: &Connection, edge_ids: Vec) -> Vec { + let formatted_edge_ids = edge_ids + .into_iter() + .map(|edge_id| edge_id.to_string()) + .collect::>() + .join(","); + let query = format!("select id, source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased from new_edges where id in ({});", formatted_edge_ids); + let mut stmt = conn.prepare_cached(&query).unwrap(); + let rows = stmt + .query_map([], |row| { + Ok(NewEdge { + id: row.get(0)?, + source_hash: row.get(1)?, + source_coordinate: row.get(2)?, + target_hash: row.get(3)?, + target_coordinate: row.get(4)?, + chromosome_index: row.get(5)?, + phased: row.get(6)?, + }) + }) + .unwrap(); + let mut objs = vec![]; + for row in rows { + objs.push(row.unwrap()); + } + objs + } } diff --git a/src/models/path.rs b/src/models/path.rs index f2339d9..cf7a538 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -1,9 +1,12 @@ -use crate::models::{block::Block, edge::Edge, path_edge::PathEdge}; +use crate::models::{block::Block, new_edge::NewEdge, path_edge::PathEdge, sequence::Sequence}; use petgraph::graphmap::DiGraphMap; use petgraph::prelude::Dfs; use petgraph::Direction; use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; +use std::collections::{HashMap, HashSet}; + +use itertools::Itertools; #[derive(Debug)] pub struct Path { @@ -49,10 +52,12 @@ pub fn revcomp(seq: &str) -> String { #[derive(Clone, Debug)] pub struct NewBlock { pub id: i32, - pub sequence_hash: String, + pub sequence: Sequence, pub block_sequence: String, - pub start: i32, - pub end: i32, + pub sequence_start: i32, + pub sequence_end: i32, + pub path_start: i32, + pub path_end: i32, pub strand: String, } @@ -84,6 +89,33 @@ impl Path { path } + pub fn new_create( + conn: &Connection, + name: &str, + block_group_id: i32, + edge_ids: Vec, + ) -> Path { + let query = "INSERT INTO path (name, block_group_id) VALUES (?1, ?2) RETURNING (id)"; + let mut stmt = conn.prepare(query).unwrap(); + let mut rows = stmt + .query_map((name, block_group_id), |row| { + Ok(Path { + id: row.get(0)?, + name: name.to_string(), + block_group_id, + blocks: vec![], + }) + }) + .unwrap(); + let path = rows.next().unwrap().unwrap(); + + for (index, edge_id) in edge_ids.iter().enumerate() { + PathEdge::create(conn, path.id, index.try_into().unwrap(), *edge_id); + } + + path + } + pub fn get(conn: &mut Connection, path_id: i32) -> Path { let query = "SELECT id, block_group_id, name from path where id = ?1;"; let mut stmt = conn.prepare(query).unwrap(); @@ -120,7 +152,7 @@ impl Path { paths } - pub fn sequence(conn: &Connection, path_id: i32) -> String { + pub fn get_sequence(conn: &Connection, path_id: i32) -> String { let block_ids = PathBlock::get_blocks(conn, path_id); let mut sequence = "".to_string(); for block_id in block_ids { @@ -134,10 +166,98 @@ impl Path { sequence } - pub fn get_new_blocks(conn: &Connection, path_id: i32) -> Vec { - let mut new_blocks = vec![]; - let edges = PathEdge::edges_for(conn, path_id); - new_blocks + pub fn new_get_sequence(conn: &Connection, path: Path) -> String { + let blocks = Path::blocks_for(conn, path); + blocks + .into_iter() + .map(|block| block.block_sequence) + .collect::>() + .join("") + } + + pub fn edge_pairs_to_block( + block_id: i32, + path: &Path, + into: NewEdge, + out_of: NewEdge, + sequences_by_hash: &HashMap, + current_path_length: i32, + ) -> NewBlock { + if into.target_hash.is_none() || out_of.source_hash.is_none() { + panic!( + "Consecutive edges in path {} have None as internal block sequence", + path.id + ); + } + + if into.target_hash != out_of.source_hash { + panic!( + "Consecutive edges in path {0} don't share the same block", + path.id + ); + } + + let sequence = sequences_by_hash.get(&into.target_hash.unwrap()).unwrap(); + let start = into.target_coordinate.unwrap(); + let end = out_of.source_coordinate.unwrap(); + + let strand; + let block_sequence; + + if end >= start { + strand = "+"; + block_sequence = sequence.sequence[start as usize..end as usize].to_string(); + } else { + strand = "-"; + block_sequence = revcomp(&sequence.sequence[end as usize..start as usize + 1]); + } + + NewBlock { + id: block_id, + sequence: sequence.clone(), + block_sequence, + sequence_start: start, + sequence_end: end, + path_start: current_path_length, + path_end: current_path_length + end, + strand: strand.to_string(), + } + } + + pub fn blocks_for(conn: &Connection, path: Path) -> Vec { + let edges = PathEdge::edges_for(conn, path.id); + let mut sequence_hashes = HashSet::new(); + for edge in &edges { + if edge.source_hash.is_some() { + sequence_hashes.insert(edge.source_hash.clone().unwrap()); + } + if edge.target_hash.is_some() { + sequence_hashes.insert(edge.target_hash.clone().unwrap()); + } + } + let sequences_by_hash = Sequence::get_sequences_by_hash( + conn, + sequence_hashes + .into_iter() + .map(|hash| format!("\"{hash}\"")) + .collect(), + ); + + let mut blocks = vec![]; + let mut path_length = 0; + for (index, (into, out_of)) in edges.into_iter().tuple_windows().enumerate() { + let block = Path::edge_pairs_to_block( + index as i32, + &path, + into, + out_of, + &sequences_by_hash, + path_length, + ); + path_length += block.block_sequence.len() as i32; + blocks.push(block); + } + blocks } } @@ -290,7 +410,7 @@ mod tests { use super::*; use crate::migrations::run_migrations; - use crate::models::{sequence::Sequence, BlockGroup, Collection}; + use crate::models::{sequence::Sequence, BlockGroup, Collection, Edge}; fn get_connection() -> Connection { let mut conn = Connection::open_in_memory() @@ -323,7 +443,7 @@ mod tests { block_group.id, vec![block1.id, block2.id, block3.id], ); - assert_eq!(Path::sequence(conn, path.id), "ATCGATCGAAAAAAACCCCCCC"); + assert_eq!(Path::get_sequence(conn, path.id), "ATCGATCGAAAAAAACCCCCCC"); } #[test] @@ -349,7 +469,7 @@ mod tests { block_group.id, vec![block3.id, block2.id, block1.id], ); - assert_eq!(Path::sequence(conn, path.id), "GGGGGGGTTTTTTTCGATCGAT"); + assert_eq!(Path::get_sequence(conn, path.id), "GGGGGGGTTTTTTTCGATCGAT"); } #[test] diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs index 06da710..1413611 100644 --- a/src/models/path_edge.rs +++ b/src/models/path_edge.rs @@ -1,32 +1,28 @@ -use crate::models::new_edge::NewEdge; +use crate::models::{new_edge::NewEdge, path::Path}; use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; +use std::collections::HashMap; -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct PathEdge { pub id: i32, pub path_id: i32, - pub source_edge_id: Option, - pub target_edge_id: Option, + pub index_in_path: i32, + pub edge_id: i32, } impl PathEdge { - pub fn create( - conn: &Connection, - path_id: i32, - source_edge_id: Option, - target_edge_id: Option, - ) -> PathEdge { + pub fn create(conn: &Connection, path_id: i32, index_in_path: i32, edge_id: i32) -> PathEdge { let query = - "INSERT INTO path_edges (path_id, source_edge_id, target_edge_id) VALUES (?1, ?2, ?3) RETURNING (id)"; + "INSERT INTO path_edges (path_id, index_in_path, edge_id) VALUES (?1, ?2, ?3) RETURNING (id)"; let mut stmt = conn.prepare(query).unwrap(); let mut rows = stmt - .query_map((path_id, source_edge_id, target_edge_id), |row| { + .query_map((path_id, index_in_path, edge_id), |row| { Ok(PathEdge { id: row.get(0)?, path_id, - source_edge_id, - target_edge_id, + index_in_path, + edge_id, }) }) .unwrap(); @@ -35,31 +31,17 @@ impl PathEdge { Err(rusqlite::Error::SqliteFailure(err, details)) => { if err.code == rusqlite::ErrorCode::ConstraintViolation { println!("{err:?} {details:?}"); - let query; let mut placeholders = vec![path_id]; - if let Some(s) = source_edge_id { - if let Some(t) = target_edge_id { - query = "SELECT id from path_edges where path_id = ?1 AND source_edge_id = ?2 AND target_edge_id = ?3;"; - placeholders.push(s); - placeholders.push(t); - } else { - query = "SELECT id from path_edges where path_id = ?1 AND source_edge_id = ?2 AND target_edge_id is null;"; - placeholders.push(s); - } - } else if let Some(t) = target_edge_id { - query = "SELECT id from path_edges where path_id = ?1 AND source_edge_id is null AND target_edge_id = ?2;"; - placeholders.push(t); - } else { - panic!("No edge ids passed"); - } + let query = "SELECT id from path_edges where path_id = ?1 AND edge_id = ?2;"; + placeholders.push(edge_id); println!("{query} {placeholders:?}"); PathEdge { id: conn .query_row(query, params_from_iter(&placeholders), |row| row.get(0)) .unwrap(), path_id, - source_edge_id, - target_edge_id, + index_in_path, + edge_id, } } else { panic!("something bad happened querying the database") @@ -78,8 +60,8 @@ impl PathEdge { Ok(PathEdge { id: row.get(0)?, path_id: row.get(1)?, - source_edge_id: row.get(2)?, - target_edge_id: row.get(3)?, + index_in_path: row.get(2)?, + edge_id: row.get(3)?, }) }) .unwrap(); @@ -91,12 +73,171 @@ impl PathEdge { } pub fn edges_for(conn: &Connection, path_id: i32) -> Vec { - let edges = vec![]; let path_edges = PathEdge::query( conn, - "select * from path_edges where path_id = ?1", + "select * from path_edges where path_id = ?1 order by index_in_path ASC", vec![Value::from(path_id)], ); - edges + let edge_ids = path_edges.into_iter().map(|path_edge| path_edge.edge_id); + let edges = NewEdge::load(conn, edge_ids.clone().collect()); + let edges_by_id = edges + .into_iter() + .map(|edge| (edge.id, edge)) + .collect::>(); + edge_ids + .into_iter() + .map(|edge_id| edges_by_id[&edge_id].clone()) + .collect::>() + } +} + +mod tests { + use rusqlite::Connection; + // Note this useful idiom: importing names from outer (for mod tests) scope. + use super::*; + + use crate::migrations::run_migrations; + use crate::models::{sequence::Sequence, BlockGroup, Collection}; + + fn get_connection() -> Connection { + let mut conn = Connection::open_in_memory() + .unwrap_or_else(|_| panic!("Error opening in memory test db")); + rusqlite::vtab::array::load_module(&conn).unwrap(); + run_migrations(&mut conn); + conn + } + + #[test] + fn test_gets_sequence() { + let conn = &mut get_connection(); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true); + let edge1 = NewEdge::create( + conn, + None, + None, + Some(sequence1_hash.clone()), + Some(0), + 0, + 0, + ); + let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true); + let edge2 = NewEdge::create( + conn, + Some(sequence1_hash.clone()), + Some(8), + Some(sequence2_hash.clone()), + Some(1), + 0, + 0, + ); + let sequence3_hash = Sequence::create(conn, "DNA", "CCCCCCCC", true); + let edge3 = NewEdge::create( + conn, + Some(sequence2_hash.clone()), + Some(8), + Some(sequence3_hash.clone()), + Some(1), + 0, + 0, + ); + let sequence4_hash = Sequence::create(conn, "DNA", "GGGGGGGG", true); + let edge4 = NewEdge::create( + conn, + Some(sequence3_hash.clone()), + Some(8), + Some(sequence4_hash.clone()), + Some(1), + 0, + 0, + ); + let edge5 = NewEdge::create( + conn, + Some(sequence4_hash.clone()), + Some(8), + None, + None, + 0, + 0, + ); + + let path = Path::new_create( + conn, + "chr1", + block_group.id, + vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], + ); + assert_eq!( + Path::new_get_sequence(conn, path), + "ATCGATCGAAAAAAACCCCCCCGGGGGGG" + ); + } + + #[test] + fn test_gets_sequence_with_rc() { + let conn = &mut get_connection(); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true); + let edge5 = NewEdge::create( + conn, + Some(sequence1_hash.clone()), + Some(0), + None, + None, + 0, + 0, + ); + let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true); + let edge4 = NewEdge::create( + conn, + Some(sequence2_hash.clone()), + Some(1), + Some(sequence1_hash.clone()), + Some(7), + 0, + 0, + ); + let sequence3_hash = Sequence::create(conn, "DNA", "CCCCCCCC", true); + let edge3 = NewEdge::create( + conn, + Some(sequence3_hash.clone()), + Some(1), + Some(sequence2_hash.clone()), + Some(7), + 0, + 0, + ); + let sequence4_hash = Sequence::create(conn, "DNA", "GGGGGGGG", true); + let edge2 = NewEdge::create( + conn, + Some(sequence4_hash.clone()), + Some(1), + Some(sequence3_hash.clone()), + Some(7), + 0, + 0, + ); + let edge1 = NewEdge::create( + conn, + None, + None, + Some(sequence4_hash.clone()), + Some(7), + 0, + 0, + ); + + let path = Path::new_create( + conn, + "chr1", + block_group.id, + vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], + ); + assert_eq!( + Path::new_get_sequence(conn, path), + "CCCCCCCGGGGGGGTTTTTTTCGATCGAT" + ); } } diff --git a/src/models/sequence.rs b/src/models/sequence.rs index 1854e54..57a3ef3 100644 --- a/src/models/sequence.rs +++ b/src/models/sequence.rs @@ -1,8 +1,9 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; use sha2::{Digest, Sha256}; +use std::collections::HashMap; -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct Sequence { pub hash: String, pub sequence_type: String, @@ -67,4 +68,21 @@ impl Sequence { } objs } + + pub fn get_sequences_by_hash( + conn: &Connection, + hashes: Vec, + ) -> HashMap { + let mut sequence_map = HashMap::new(); + let joined_hashes = &hashes.join(","); + for sequence in Sequence::get_sequences( + conn, + &format!("select * from sequence where hash in ({0})", joined_hashes), + vec![], + ) { + sequence_map.insert(sequence.hash.clone(), sequence); + } + + sequence_map + } } From c25098b095cb4f095c8ce509159defb37dbdcc21 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 19 Aug 2024 19:16:17 -0400 Subject: [PATCH 03/18] More functional --- src/models/sequence.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/models/sequence.rs b/src/models/sequence.rs index 57a3ef3..d24c03a 100644 --- a/src/models/sequence.rs +++ b/src/models/sequence.rs @@ -73,16 +73,15 @@ impl Sequence { conn: &Connection, hashes: Vec, ) -> HashMap { - let mut sequence_map = HashMap::new(); let joined_hashes = &hashes.join(","); - for sequence in Sequence::get_sequences( + let sequences = Sequence::get_sequences( conn, &format!("select * from sequence where hash in ({0})", joined_hashes), vec![], - ) { - sequence_map.insert(sequence.hash.clone(), sequence); - } - - sequence_map + ); + sequences + .into_iter() + .map(|sequence| (sequence.hash.clone(), sequence)) + .collect::>() } } From 2392595ca4f3ea85506d1b0ba99d3447a17795cb Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 19 Aug 2024 19:23:35 -0400 Subject: [PATCH 04/18] undo weird naming --- src/main.rs | 2 +- src/models.rs | 2 +- src/models/path.rs | 10 +++++----- src/models/path_edge.rs | 4 ++-- src/models/sequence.rs | 13 +++---------- 5 files changed, 12 insertions(+), 19 deletions(-) diff --git a/src/main.rs b/src/main.rs index 5c76689..bea2e9d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -302,7 +302,7 @@ mod tests { HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); assert_eq!( - Path::get_sequence(&conn, 1), + Path::sequence(&conn, 1), "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string() ); } diff --git a/src/models.rs b/src/models.rs index 1d4abb4..70addee 100644 --- a/src/models.rs +++ b/src/models.rs @@ -278,7 +278,7 @@ impl BlockGroup { .values() .map(|block| format!("\"{id}\"", id = block.sequence_hash)) .collect::>(); - let sequence_map = Sequence::get_sequences_by_hash(conn, sequence_hashes); + let sequence_map = Sequence::sequences_by_hash(conn, sequence_hashes); let block_ids = block_map .keys() .map(|id| format!("{id}")) diff --git a/src/models/path.rs b/src/models/path.rs index cf7a538..fa191ed 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -152,7 +152,7 @@ impl Path { paths } - pub fn get_sequence(conn: &Connection, path_id: i32) -> String { + pub fn sequence(conn: &Connection, path_id: i32) -> String { let block_ids = PathBlock::get_blocks(conn, path_id); let mut sequence = "".to_string(); for block_id in block_ids { @@ -166,7 +166,7 @@ impl Path { sequence } - pub fn new_get_sequence(conn: &Connection, path: Path) -> String { + pub fn new_sequence(conn: &Connection, path: Path) -> String { let blocks = Path::blocks_for(conn, path); blocks .into_iter() @@ -235,7 +235,7 @@ impl Path { sequence_hashes.insert(edge.target_hash.clone().unwrap()); } } - let sequences_by_hash = Sequence::get_sequences_by_hash( + let sequences_by_hash = Sequence::sequences_by_hash( conn, sequence_hashes .into_iter() @@ -443,7 +443,7 @@ mod tests { block_group.id, vec![block1.id, block2.id, block3.id], ); - assert_eq!(Path::get_sequence(conn, path.id), "ATCGATCGAAAAAAACCCCCCC"); + assert_eq!(Path::sequence(conn, path.id), "ATCGATCGAAAAAAACCCCCCC"); } #[test] @@ -469,7 +469,7 @@ mod tests { block_group.id, vec![block3.id, block2.id, block1.id], ); - assert_eq!(Path::get_sequence(conn, path.id), "GGGGGGGTTTTTTTCGATCGAT"); + assert_eq!(Path::sequence(conn, path.id), "GGGGGGGTTTTTTTCGATCGAT"); } #[test] diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs index 1413611..ce6ccbe 100644 --- a/src/models/path_edge.rs +++ b/src/models/path_edge.rs @@ -169,7 +169,7 @@ mod tests { vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); assert_eq!( - Path::new_get_sequence(conn, path), + Path::new_sequence(conn, path), "ATCGATCGAAAAAAACCCCCCCGGGGGGG" ); } @@ -236,7 +236,7 @@ mod tests { vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); assert_eq!( - Path::new_get_sequence(conn, path), + Path::new_sequence(conn, path), "CCCCCCCGGGGGGGTTTTTTTCGATCGAT" ); } diff --git a/src/models/sequence.rs b/src/models/sequence.rs index d24c03a..ce6dd65 100644 --- a/src/models/sequence.rs +++ b/src/models/sequence.rs @@ -46,11 +46,7 @@ impl Sequence { obj_hash } - pub fn get_sequences( - conn: &Connection, - query: &str, - placeholders: Vec, - ) -> Vec { + pub fn sequences(conn: &Connection, query: &str, placeholders: Vec) -> Vec { let mut stmt = conn.prepare_cached(query).unwrap(); let rows = stmt .query_map(params_from_iter(placeholders), |row| { @@ -69,12 +65,9 @@ impl Sequence { objs } - pub fn get_sequences_by_hash( - conn: &Connection, - hashes: Vec, - ) -> HashMap { + pub fn sequences_by_hash(conn: &Connection, hashes: Vec) -> HashMap { let joined_hashes = &hashes.join(","); - let sequences = Sequence::get_sequences( + let sequences = Sequence::sequences( conn, &format!("select * from sequence where hash in ({0})", joined_hashes), vec![], From 8c149e8a9203dec3b45fdefda2bce82becca8ea8 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 19 Aug 2024 19:24:18 -0400 Subject: [PATCH 05/18] Import rework --- src/models/path.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/models/path.rs b/src/models/path.rs index fa191ed..2f47a31 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -1,4 +1,5 @@ use crate::models::{block::Block, new_edge::NewEdge, path_edge::PathEdge, sequence::Sequence}; +use itertools::Itertools; use petgraph::graphmap::DiGraphMap; use petgraph::prelude::Dfs; use petgraph::Direction; @@ -6,8 +7,6 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; use std::collections::{HashMap, HashSet}; -use itertools::Itertools; - #[derive(Debug)] pub struct Path { pub id: i32, From 447ceeffb500d8294864134707450fcd1046e4d4 Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 20 Aug 2024 12:56:49 -0400 Subject: [PATCH 06/18] Add method for returning intervaltree mapping path ranges to blocks --- Cargo.lock | 10 +++++ Cargo.toml | 1 + src/models/path.rs | 108 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 118 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index be95ac1..12cb1b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -591,6 +591,7 @@ dependencies = [ "bio", "clap", "include_dir", + "intervaltree", "itertools", "noodles", "petgraph", @@ -679,6 +680,15 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "intervaltree" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "270bc34e57047cab801a8c871c124d9dc7132f6473c6401f645524f4e6edd111" +dependencies = [ + "smallvec", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" diff --git a/Cargo.toml b/Cargo.toml index 8230f76..1d08dce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" bio = "2.0.0" clap = { version = "4.5.8", features = ["derive"] } include_dir = "0.7.4" +intervaltree = "0.2.7" itertools = "0.13.0" rusqlite = { version = "0.31.0", features = ["bundled", "array"] } rusqlite_migration = { version = "1.2.0" , features = ["from-directory"]} diff --git a/src/models/path.rs b/src/models/path.rs index 2f47a31..dfcab60 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -1,4 +1,5 @@ use crate::models::{block::Block, new_edge::NewEdge, path_edge::PathEdge, sequence::Sequence}; +use intervaltree::IntervalTree; use itertools::Itertools; use petgraph::graphmap::DiGraphMap; use petgraph::prelude::Dfs; @@ -202,13 +203,16 @@ impl Path { let strand; let block_sequence; + let block_sequence_length; if end >= start { strand = "+"; block_sequence = sequence.sequence[start as usize..end as usize].to_string(); + block_sequence_length = end - start; } else { strand = "-"; block_sequence = revcomp(&sequence.sequence[end as usize..start as usize + 1]); + block_sequence_length = start - end; } NewBlock { @@ -218,7 +222,7 @@ impl Path { sequence_start: start, sequence_end: end, path_start: current_path_length, - path_end: current_path_length + end, + path_end: current_path_length + block_sequence_length, strand: strand.to_string(), } } @@ -258,6 +262,15 @@ impl Path { } blocks } + + pub fn intervaltree_for(conn: &Connection, path: Path) -> IntervalTree { + let blocks = Path::blocks_for(conn, path); + let tree: IntervalTree = blocks + .into_iter() + .map(|block| (block.path_start..block.path_end, block)) + .collect(); + tree + } } #[derive(Debug)] @@ -477,4 +490,97 @@ mod tests { assert_eq!(revcomp("CNNNNA"), "TNNNNG"); assert_eq!(revcomp("cNNgnAt"), "aTncNNg"); } + + #[test] + fn test_intervaltree() { + let conn = &mut get_connection(); + Collection::create(conn, "test collection"); + let block_group = BlockGroup::create(conn, "test collection", None, "test block group"); + let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true); + let edge1 = NewEdge::create( + conn, + None, + None, + Some(sequence1_hash.clone()), + Some(0), + 0, + 0, + ); + let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true); + let edge2 = NewEdge::create( + conn, + Some(sequence1_hash.clone()), + Some(8), + Some(sequence2_hash.clone()), + Some(1), + 0, + 0, + ); + let sequence3_hash = Sequence::create(conn, "DNA", "CCCCCCCC", true); + let edge3 = NewEdge::create( + conn, + Some(sequence2_hash.clone()), + Some(8), + Some(sequence3_hash.clone()), + Some(1), + 0, + 0, + ); + let sequence4_hash = Sequence::create(conn, "DNA", "GGGGGGGG", true); + let edge4 = NewEdge::create( + conn, + Some(sequence3_hash.clone()), + Some(8), + Some(sequence4_hash.clone()), + Some(1), + 0, + 0, + ); + let edge5 = NewEdge::create( + conn, + Some(sequence4_hash.clone()), + Some(8), + None, + None, + 0, + 0, + ); + + let path = Path::new_create( + conn, + "chr1", + block_group.id, + vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], + ); + let tree = Path::intervaltree_for(conn, path); + let blocks1: Vec<_> = tree.query_point(2).map(|x| x.value.clone()).collect(); + assert_eq!(blocks1.len(), 1); + let block1 = &blocks1[0]; + assert_eq!(block1.sequence.hash, sequence1_hash); + assert_eq!(block1.sequence_start, 0); + assert_eq!(block1.sequence_end, 8); + assert_eq!(block1.path_start, 0); + assert_eq!(block1.path_end, 8); + assert_eq!(block1.strand, "+"); + + let blocks2: Vec<_> = tree.query_point(12).map(|x| x.value.clone()).collect(); + assert_eq!(blocks2.len(), 1); + let block2 = &blocks2[0]; + assert_eq!(block2.sequence.hash, sequence2_hash); + assert_eq!(block2.sequence_start, 1); + assert_eq!(block2.sequence_end, 8); + assert_eq!(block2.path_start, 8); + assert_eq!(block2.path_end, 15); + assert_eq!(block2.strand, "+"); + + let blocks4: Vec<_> = tree.query_point(25).map(|x| x.value.clone()).collect(); + assert_eq!(blocks4.len(), 1); + let block4 = &blocks4[0]; + assert_eq!(block4.sequence.hash, sequence4_hash); + assert_eq!(block4.sequence_start, 1); + assert_eq!(block4.sequence_end, 8); + assert_eq!(block4.path_start, 22); + assert_eq!(block4.path_end, 29); + assert_eq!(block4.strand, "+"); + } } From 6d5155cebdfee8ee1533f197a0eed8b486545caf Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 20 Aug 2024 15:46:49 -0400 Subject: [PATCH 07/18] Add new_insert_change --- src/models.rs | 109 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) diff --git a/src/models.rs b/src/models.rs index 70addee..a6ce48f 100644 --- a/src/models.rs +++ b/src/models.rs @@ -16,7 +16,8 @@ pub mod sequence; use crate::graph::all_simple_paths; use crate::models::block::Block; use crate::models::edge::Edge; -use crate::models::path::{Path, PathBlock}; +use crate::models::new_edge::NewEdge; +use crate::models::path::{NewBlock, Path, PathBlock}; use crate::models::sequence::Sequence; use crate::{get_overlap, models}; @@ -501,6 +502,112 @@ impl BlockGroup { change.save(conn); } + + #[allow(clippy::ptr_arg)] + #[allow(clippy::too_many_arguments)] + pub fn new_insert_change( + conn: &mut Connection, + path: Path, + start: i32, + end: i32, + new_block: &NewBlock, + chromosome_index: i32, + phased: i32, + ) -> Vec { + // todo: + // cases to check: + // change that is the size of a block + // change that goes over multiple blocks + // change that hits just start/end boundary, e.g. block is 1,5 and change is 3,5 or 1,3. + // change that deletes block boundary + + let tree = Path::intervaltree_for(conn, path); + + let start_blocks: Vec = + tree.query_point(start).map(|x| x.value.clone()).collect(); + assert_eq!(start_blocks.len(), 1); + let start_block = &start_blocks[0]; + + let end_blocks: Vec = tree.query_point(end).map(|x| x.value.clone()).collect(); + assert_eq!(end_blocks.len(), 1); + let end_block = &end_blocks[0]; + + let mut new_edges = vec![]; + + if new_block.sequence_start == new_block.sequence_end { + // Deletion + let new_edge = NewEdge { + id: 0, + source_hash: Some(start_block.sequence.hash.clone()), + source_coordinate: Some( + start - start_block.path_start + start_block.sequence_start, + ), + target_hash: Some(end_block.sequence.hash.clone()), + target_coordinate: Some(end - end_block.path_start + end_block.sequence_start), + chromosome_index, + phased, + }; + new_edges.push(new_edge); + } else { + // Insertion/replacement + let new_start_edge = NewEdge { + id: 0, + source_hash: Some(start_block.sequence.hash.clone()), + source_coordinate: Some( + start - start_block.path_start + start_block.sequence_start, + ), + target_hash: Some(new_block.sequence.hash.clone()), + target_coordinate: Some(new_block.sequence_start), + chromosome_index, + phased, + }; + let new_end_edge = NewEdge { + id: 0, + source_hash: Some(new_block.sequence.hash.clone()), + source_coordinate: Some(new_block.sequence_end), + target_hash: Some(end_block.sequence.hash.clone()), + target_coordinate: Some(end - end_block.path_start + end_block.sequence_start), + chromosome_index, + phased, + }; + new_edges.push(new_start_edge); + new_edges.push(new_end_edge); + } + + // NOTE: Add edges marking the existing "block" that is being substituted out, so we can + // retrieve it as one node of the overall graph + if start < start_block.path_end { + let split_coordinate = start - start_block.path_start + start_block.sequence_start; + let new_split_start_edge = NewEdge { + id: 0, + source_hash: Some(start_block.sequence.hash.clone()), + source_coordinate: Some(split_coordinate), + target_hash: Some(start_block.sequence.hash.clone()), + target_coordinate: Some(split_coordinate + 1), + chromosome_index, + phased, + }; + new_edges.push(new_split_start_edge); + } + + if end > end_block.path_start { + let split_coordinate = end - end_block.path_start + end_block.sequence_start; + let new_split_end_edge = NewEdge { + id: 0, + source_hash: Some(new_block.sequence.hash.clone()), + source_coordinate: Some(split_coordinate), + target_hash: Some(end_block.sequence.hash.clone()), + target_coordinate: Some(split_coordinate + 1), + chromosome_index, + phased, + }; + + new_edges.push(new_split_end_edge); + } + + // TODO: bulk create-or-get of new edges + new_edges + } } pub struct ChangeLog { From a67c60f9ef4f83c1d3e6b057f26226e0a5a2f2be Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 20 Aug 2024 21:12:39 -0400 Subject: [PATCH 08/18] Add bulk create method for new edges --- migrations/01-initial/up.sql | 12 +- src/models.rs | 44 +++-- src/models/new_edge.rs | 304 ++++++++++++++++++++++++++++++----- src/models/path.rs | 61 ++++--- src/models/path_edge.rs | 82 +++++----- 5 files changed, 358 insertions(+), 145 deletions(-) diff --git a/migrations/01-initial/up.sql b/migrations/01-initial/up.sql index 7c39843..7abdfbf 100644 --- a/migrations/01-initial/up.sql +++ b/migrations/01-initial/up.sql @@ -9,7 +9,7 @@ CREATE TABLE sample ( CREATE TABLE sequence ( hash TEXT PRIMARY KEY NOT NULL, sequence_type TEXT NOT NULL, - sequence TEXT, + sequence TEXT NOT NULL, "length" INTEGER NOT NULL ); @@ -83,10 +83,10 @@ CREATE UNIQUE INDEX change_log_uidx ON change_log(hash); CREATE TABLE new_edges ( id INTEGER PRIMARY KEY NOT NULL, - source_hash TEXT, - source_coordinate INTEGER, - target_hash TEXT, - target_coordinate INTEGER, + source_hash TEXT NOT NULL, + source_coordinate INTEGER NOT NULL, + target_hash TEXT NOT NULL, + target_coordinate INTEGER NOT NULL, chromosome_index INTEGER NOT NULL, phased INTEGER NOT NULL, FOREIGN KEY(source_hash) REFERENCES sequence(hash), @@ -104,3 +104,5 @@ CREATE TABLE path_edges ( FOREIGN KEY(path_id) REFERENCES path(id) ); CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id); + +INSERT INTO sequence (hash, sequence_type, sequence, "length") values ("yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", 64), ("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", 64); diff --git a/src/models.rs b/src/models.rs index a6ce48f..bf233fa 100644 --- a/src/models.rs +++ b/src/models.rs @@ -538,12 +538,10 @@ impl BlockGroup { // Deletion let new_edge = NewEdge { id: 0, - source_hash: Some(start_block.sequence.hash.clone()), - source_coordinate: Some( - start - start_block.path_start + start_block.sequence_start, - ), - target_hash: Some(end_block.sequence.hash.clone()), - target_coordinate: Some(end - end_block.path_start + end_block.sequence_start), + source_hash: start_block.sequence.hash.clone(), + source_coordinate: start - start_block.path_start + start_block.sequence_start, + target_hash: end_block.sequence.hash.clone(), + target_coordinate: end - end_block.path_start + end_block.sequence_start, chromosome_index, phased, }; @@ -552,21 +550,19 @@ impl BlockGroup { // Insertion/replacement let new_start_edge = NewEdge { id: 0, - source_hash: Some(start_block.sequence.hash.clone()), - source_coordinate: Some( - start - start_block.path_start + start_block.sequence_start, - ), - target_hash: Some(new_block.sequence.hash.clone()), - target_coordinate: Some(new_block.sequence_start), + source_hash: start_block.sequence.hash.clone(), + source_coordinate: start - start_block.path_start + start_block.sequence_start, + target_hash: new_block.sequence.hash.clone(), + target_coordinate: new_block.sequence_start, chromosome_index, phased, }; let new_end_edge = NewEdge { id: 0, - source_hash: Some(new_block.sequence.hash.clone()), - source_coordinate: Some(new_block.sequence_end), - target_hash: Some(end_block.sequence.hash.clone()), - target_coordinate: Some(end - end_block.path_start + end_block.sequence_start), + source_hash: new_block.sequence.hash.clone(), + source_coordinate: new_block.sequence_end, + target_hash: end_block.sequence.hash.clone(), + target_coordinate: end - end_block.path_start + end_block.sequence_start, chromosome_index, phased, }; @@ -580,10 +576,10 @@ impl BlockGroup { let split_coordinate = start - start_block.path_start + start_block.sequence_start; let new_split_start_edge = NewEdge { id: 0, - source_hash: Some(start_block.sequence.hash.clone()), - source_coordinate: Some(split_coordinate), - target_hash: Some(start_block.sequence.hash.clone()), - target_coordinate: Some(split_coordinate + 1), + source_hash: start_block.sequence.hash.clone(), + source_coordinate: split_coordinate, + target_hash: start_block.sequence.hash.clone(), + target_coordinate: split_coordinate + 1, chromosome_index, phased, }; @@ -594,10 +590,10 @@ impl BlockGroup { let split_coordinate = end - end_block.path_start + end_block.sequence_start; let new_split_end_edge = NewEdge { id: 0, - source_hash: Some(new_block.sequence.hash.clone()), - source_coordinate: Some(split_coordinate), - target_hash: Some(end_block.sequence.hash.clone()), - target_coordinate: Some(split_coordinate + 1), + source_hash: new_block.sequence.hash.clone(), + source_coordinate: split_coordinate, + target_hash: end_block.sequence.hash.clone(), + target_coordinate: split_coordinate + 1, chromosome_index, phased, }; diff --git a/src/models/new_edge.rs b/src/models/new_edge.rs index 7bd833c..57aa5fc 100644 --- a/src/models/new_edge.rs +++ b/src/models/new_edge.rs @@ -1,54 +1,55 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; +use std::collections::HashSet; +use std::hash::RandomState; #[derive(Clone, Debug)] pub struct NewEdge { pub id: i32, - pub source_hash: Option, - pub source_coordinate: Option, - pub target_hash: Option, - pub target_coordinate: Option, + pub source_hash: String, + pub source_coordinate: i32, + pub target_hash: String, + pub target_coordinate: i32, + pub chromosome_index: i32, + pub phased: i32, +} + +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub struct EdgeData { + pub source_hash: String, + pub source_coordinate: i32, + pub target_hash: String, + pub target_coordinate: i32, pub chromosome_index: i32, pub phased: i32, } impl NewEdge { + pub const PATH_START_HASH: &'static str = + "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; + pub const PATH_END_HASH: &'static str = + "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"; + pub fn create( conn: &Connection, - source_hash: Option, - source_coordinate: Option, - target_hash: Option, - target_coordinate: Option, + source_hash: String, + source_coordinate: i32, + target_hash: String, + target_coordinate: i32, chromosome_index: i32, phased: i32, ) -> NewEdge { - let query; - let id_query; - let mut placeholders: Vec = vec![]; - if target_hash.is_some() && source_hash.is_some() { - query = "INSERT INTO new_edges (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6) RETURNING *"; - id_query = "select id from new_edges where source_hash = ?1 and source_coordinate = ?2 and target_hash = ?3 and target_coordinate = ?4 and chromosome_index = ?5 and phased = ?6"; - placeholders.push(source_hash.clone().unwrap().into()); - placeholders.push(source_coordinate.unwrap().into()); - placeholders.push(target_hash.clone().unwrap().into()); - placeholders.push(target_coordinate.unwrap().into()); - placeholders.push(chromosome_index.into()); - placeholders.push(phased.into()); - } else if target_hash.is_some() { - id_query = "select id from new_edges where target_hash = ?1 and target_coordinate = ?2 and source_hash is null and chromosome_index = ?3 and phased = ?4"; - query = "INSERT INTO new_edges (target_hash, target_coordinate, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4) RETURNING *"; - placeholders.push(target_hash.clone().unwrap().into()); - placeholders.push(target_coordinate.unwrap().into()); - placeholders.push(chromosome_index.into()); - placeholders.push(phased.into()); - } else { - id_query = "select id from new_edges where source_hash = ?1 and source_coordinate = ?2 and target_id is null and chromosome_index = ?3 and phased = ?4"; - query = "INSERT INTO new_edges (source_hash, source_coordinate, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4) RETURNING *"; - placeholders.push(source_hash.clone().unwrap().into()); - placeholders.push(source_coordinate.unwrap().into()); - placeholders.push(chromosome_index.into()); - placeholders.push(phased.into()); - } + let query = "INSERT INTO new_edges (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6) RETURNING *"; + let id_query = "select id from new_edges where source_hash = ?1 and source_coordinate = ?2 and target_hash = ?3 and target_coordinate = ?4 and chromosome_index = ?5 and phased = ?6"; + let mut placeholders: Vec = vec![ + source_hash.clone().into(), + source_coordinate.into(), + target_hash.clone().into(), + target_coordinate.into(), + chromosome_index.into(), + phased.into(), + ]; + let mut stmt = conn.prepare(query).unwrap(); match stmt.query_row(params_from_iter(&placeholders), |row| { Ok(NewEdge { @@ -86,16 +87,20 @@ impl NewEdge { } } - pub fn load(conn: &Connection, edge_ids: Vec) -> Vec { + pub fn bulk_load(conn: &Connection, edge_ids: Vec) -> Vec { let formatted_edge_ids = edge_ids .into_iter() .map(|edge_id| edge_id.to_string()) .collect::>() .join(","); let query = format!("select id, source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased from new_edges where id in ({});", formatted_edge_ids); - let mut stmt = conn.prepare_cached(&query).unwrap(); + NewEdge::query(conn, &query, vec![]) + } + + pub fn query(conn: &Connection, query: &str, placeholders: Vec) -> Vec { + let mut stmt = conn.prepare(query).unwrap(); let rows = stmt - .query_map([], |row| { + .query_map(params_from_iter(placeholders), |row| { Ok(NewEdge { id: row.get(0)?, source_hash: row.get(1)?, @@ -107,10 +112,227 @@ impl NewEdge { }) }) .unwrap(); - let mut objs = vec![]; + let mut edges = vec![]; for row in rows { - objs.push(row.unwrap()); + edges.push(row.unwrap()); + } + edges + } + + pub fn bulk_create(conn: &Connection, edges: Vec) -> Vec { + let mut edge_rows = vec![]; + for edge in &edges { + let source_hash = format!("\"{0}\"", edge.source_hash); + let target_hash = format!("\"{0}\"", edge.target_hash); + let edge_row = format!( + "({0}, {1}, {2}, {3}, {4}, {5})", + source_hash, + edge.source_coordinate, + target_hash, + edge.target_coordinate, + edge.chromosome_index, + edge.phased + ); + edge_rows.push(edge_row); } - objs + let formatted_edge_rows = edge_rows.join(", "); + + let select_statement = format!("SELECT * FROM new_edges WHERE (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) in ({0});", formatted_edge_rows); + let existing_edges = NewEdge::query(conn, &select_statement, vec![]); + let mut existing_edge_ids: Vec = existing_edges + .clone() + .into_iter() + .map(|edge| edge.id) + .collect(); + + let existing_edge_set = HashSet::::from_iter( + existing_edges.into_iter().map(NewEdge::to_data), + ); + let mut edges_to_insert = HashSet::new(); + for edge in &edges { + if !existing_edge_set.contains(edge) { + edges_to_insert.insert(edge); + } + } + + let mut edge_rows_to_insert = vec![]; + for edge in edges_to_insert { + let source_hash = format!("\"{0}\"", edge.source_hash); + let target_hash = format!("\"{0}\"", edge.target_hash); + let edge_row = format!( + "({0}, {1}, {2}, {3}, {4}, {5})", + source_hash, + edge.source_coordinate, + target_hash, + edge.target_coordinate, + edge.chromosome_index, + edge.phased + ); + edge_rows_to_insert.push(edge_row); + } + let formatted_edge_rows_to_insert = edge_rows_to_insert.join(", "); + + let insert_statement = format!("INSERT OR IGNORE INTO new_edges (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) VALUES {0} RETURNING (id);", formatted_edge_rows_to_insert); + let mut stmt = conn.prepare(&insert_statement).unwrap(); + let rows = stmt.query_map([], |row| row.get(0)).unwrap(); + let mut edge_ids: Vec = vec![]; + for row in rows { + edge_ids.push(row.unwrap()); + } + + existing_edge_ids.extend(edge_ids); + existing_edge_ids + } + + pub fn to_data(edge: NewEdge) -> EdgeData { + EdgeData { + source_hash: edge.source_hash, + source_coordinate: edge.source_coordinate, + target_hash: edge.target_hash, + target_coordinate: edge.target_coordinate, + chromosome_index: edge.chromosome_index, + phased: edge.phased, + } + } +} + +mod tests { + use rusqlite::Connection; + // Note this useful idiom: importing names from outer (for mod tests) scope. + use super::*; + + use crate::migrations::run_migrations; + use crate::models::{sequence::Sequence, Collection}; + + fn get_connection() -> Connection { + let mut conn = Connection::open_in_memory() + .unwrap_or_else(|_| panic!("Error opening in memory test db")); + rusqlite::vtab::array::load_module(&conn).unwrap(); + run_migrations(&mut conn); + conn + } + + #[test] + fn test_bulk_create() { + let conn = &mut get_connection(); + Collection::create(conn, "test collection"); + let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true); + let edge1 = EdgeData { + source_hash: NewEdge::PATH_START_HASH.to_string(), + source_coordinate: -1, + target_hash: sequence1_hash.clone(), + target_coordinate: 1, + chromosome_index: 0, + phased: 0, + }; + let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true); + let edge2 = EdgeData { + source_hash: sequence1_hash.clone(), + source_coordinate: 2, + target_hash: sequence2_hash.clone(), + target_coordinate: 3, + chromosome_index: 0, + phased: 0, + }; + let edge3 = EdgeData { + source_hash: sequence2_hash.clone(), + source_coordinate: 4, + target_hash: NewEdge::PATH_END_HASH.to_string(), + target_coordinate: -1, + chromosome_index: 0, + phased: 0, + }; + + let edge_ids = NewEdge::bulk_create(conn, vec![edge1, edge2, edge3]); + assert_eq!(edge_ids.len(), 3); + let edges = NewEdge::bulk_load(conn, edge_ids); + assert_eq!(edges.len(), 3); + + let edge_result1 = &edges[0]; + assert_eq!(edge_result1.source_hash, NewEdge::PATH_START_HASH); + assert_eq!(edge_result1.source_coordinate, -1); + assert_eq!(edge_result1.target_hash, sequence1_hash); + assert_eq!(edge_result1.target_coordinate, 1); + let edge_result2 = &edges[1]; + assert_eq!(edge_result2.source_hash, sequence1_hash); + assert_eq!(edge_result2.source_coordinate, 2); + assert_eq!(edge_result2.target_hash, sequence2_hash); + assert_eq!(edge_result2.target_coordinate, 3); + let edge_result3 = &edges[2]; + assert_eq!(edge_result3.source_hash, sequence2_hash); + assert_eq!(edge_result3.source_coordinate, 4); + assert_eq!(edge_result3.target_hash, NewEdge::PATH_END_HASH); + assert_eq!(edge_result3.target_coordinate, -1); + } + + #[test] + fn test_bulk_create_with_existing_edge() { + let conn = &mut get_connection(); + Collection::create(conn, "test collection"); + let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true); + // NOTE: Create one edge ahead of time to confirm an existing row ID gets returned in the bulk create + let existing_edge = NewEdge::create( + conn, + NewEdge::PATH_START_HASH.to_string(), + -1, + sequence1_hash.clone(), + 1, + 0, + 0, + ); + assert_eq!(existing_edge.source_hash, NewEdge::PATH_START_HASH); + assert_eq!(existing_edge.source_coordinate, -1); + assert_eq!(existing_edge.target_hash, sequence1_hash); + assert_eq!(existing_edge.target_coordinate, 1); + + let edge1 = EdgeData { + source_hash: NewEdge::PATH_START_HASH.to_string(), + source_coordinate: -1, + target_hash: sequence1_hash.clone(), + target_coordinate: 1, + chromosome_index: 0, + phased: 0, + }; + let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true); + let edge2 = EdgeData { + source_hash: sequence1_hash.clone(), + source_coordinate: 2, + target_hash: sequence2_hash.clone(), + target_coordinate: 3, + chromosome_index: 0, + phased: 0, + }; + let edge3 = EdgeData { + source_hash: sequence2_hash.clone(), + source_coordinate: 4, + target_hash: NewEdge::PATH_END_HASH.to_string(), + target_coordinate: -1, + chromosome_index: 0, + phased: 0, + }; + + let edge_ids = NewEdge::bulk_create(conn, vec![edge1, edge2, edge3]); + assert_eq!(edge_ids.len(), 3); + let edges = NewEdge::bulk_load(conn, edge_ids); + assert_eq!(edges.len(), 3); + + let edge_result1 = &edges[0]; + + assert_eq!(edge_result1.id, existing_edge.id); + + assert_eq!(edge_result1.source_hash, NewEdge::PATH_START_HASH); + assert_eq!(edge_result1.source_coordinate, -1); + assert_eq!(edge_result1.target_hash, sequence1_hash); + assert_eq!(edge_result1.target_coordinate, 1); + let edge_result2 = &edges[2]; + assert_eq!(edge_result2.source_hash, sequence1_hash); + assert_eq!(edge_result2.source_coordinate, 2); + assert_eq!(edge_result2.target_hash, sequence2_hash); + assert_eq!(edge_result2.target_coordinate, 3); + let edge_result3 = &edges[1]; + assert_eq!(edge_result3.source_hash, sequence2_hash); + assert_eq!(edge_result3.source_coordinate, 4); + assert_eq!(edge_result3.target_hash, NewEdge::PATH_END_HASH); + assert_eq!(edge_result3.target_coordinate, -1); } } diff --git a/src/models/path.rs b/src/models/path.rs index dfcab60..8a302fb 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -183,13 +183,6 @@ impl Path { sequences_by_hash: &HashMap, current_path_length: i32, ) -> NewBlock { - if into.target_hash.is_none() || out_of.source_hash.is_none() { - panic!( - "Consecutive edges in path {} have None as internal block sequence", - path.id - ); - } - if into.target_hash != out_of.source_hash { panic!( "Consecutive edges in path {0} don't share the same block", @@ -197,9 +190,9 @@ impl Path { ); } - let sequence = sequences_by_hash.get(&into.target_hash.unwrap()).unwrap(); - let start = into.target_coordinate.unwrap(); - let end = out_of.source_coordinate.unwrap(); + let sequence = sequences_by_hash.get(&into.target_hash).unwrap(); + let start = into.target_coordinate; + let end = out_of.source_coordinate; let strand; let block_sequence; @@ -231,11 +224,11 @@ impl Path { let edges = PathEdge::edges_for(conn, path.id); let mut sequence_hashes = HashSet::new(); for edge in &edges { - if edge.source_hash.is_some() { - sequence_hashes.insert(edge.source_hash.clone().unwrap()); + if edge.source_hash != NewEdge::PATH_START_HASH { + sequence_hashes.insert(edge.source_hash.clone()); } - if edge.target_hash.is_some() { - sequence_hashes.insert(edge.target_hash.clone().unwrap()); + if edge.target_hash != NewEdge::PATH_END_HASH { + sequence_hashes.insert(edge.target_hash.clone()); } } let sequences_by_hash = Sequence::sequences_by_hash( @@ -499,49 +492,49 @@ mod tests { let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true); let edge1 = NewEdge::create( conn, - None, - None, - Some(sequence1_hash.clone()), - Some(0), + NewEdge::PATH_START_HASH.to_string(), + -1, + sequence1_hash.clone(), + 0, 0, 0, ); let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true); let edge2 = NewEdge::create( conn, - Some(sequence1_hash.clone()), - Some(8), - Some(sequence2_hash.clone()), - Some(1), + sequence1_hash.clone(), + 8, + sequence2_hash.clone(), + 1, 0, 0, ); let sequence3_hash = Sequence::create(conn, "DNA", "CCCCCCCC", true); let edge3 = NewEdge::create( conn, - Some(sequence2_hash.clone()), - Some(8), - Some(sequence3_hash.clone()), - Some(1), + sequence2_hash.clone(), + 8, + sequence3_hash.clone(), + 1, 0, 0, ); let sequence4_hash = Sequence::create(conn, "DNA", "GGGGGGGG", true); let edge4 = NewEdge::create( conn, - Some(sequence3_hash.clone()), - Some(8), - Some(sequence4_hash.clone()), - Some(1), + sequence3_hash.clone(), + 8, + sequence4_hash.clone(), + 1, 0, 0, ); let edge5 = NewEdge::create( conn, - Some(sequence4_hash.clone()), - Some(8), - None, - None, + sequence4_hash.clone(), + 8, + NewEdge::PATH_END_HASH.to_string(), + -1, 0, 0, ); diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs index ce6ccbe..d6ede1d 100644 --- a/src/models/path_edge.rs +++ b/src/models/path_edge.rs @@ -79,7 +79,7 @@ impl PathEdge { vec![Value::from(path_id)], ); let edge_ids = path_edges.into_iter().map(|path_edge| path_edge.edge_id); - let edges = NewEdge::load(conn, edge_ids.clone().collect()); + let edges = NewEdge::bulk_load(conn, edge_ids.clone().collect()); let edges_by_id = edges .into_iter() .map(|edge| (edge.id, edge)) @@ -115,49 +115,49 @@ mod tests { let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true); let edge1 = NewEdge::create( conn, - None, - None, - Some(sequence1_hash.clone()), - Some(0), + NewEdge::PATH_START_HASH.to_string(), + -123, + sequence1_hash.clone(), + 0, 0, 0, ); let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true); let edge2 = NewEdge::create( conn, - Some(sequence1_hash.clone()), - Some(8), - Some(sequence2_hash.clone()), - Some(1), + sequence1_hash.clone(), + 8, + sequence2_hash.clone(), + 1, 0, 0, ); let sequence3_hash = Sequence::create(conn, "DNA", "CCCCCCCC", true); let edge3 = NewEdge::create( conn, - Some(sequence2_hash.clone()), - Some(8), - Some(sequence3_hash.clone()), - Some(1), + sequence2_hash.clone(), + 8, + sequence3_hash.clone(), + 1, 0, 0, ); let sequence4_hash = Sequence::create(conn, "DNA", "GGGGGGGG", true); let edge4 = NewEdge::create( conn, - Some(sequence3_hash.clone()), - Some(8), - Some(sequence4_hash.clone()), - Some(1), + sequence3_hash.clone(), + 8, + sequence4_hash.clone(), + 1, 0, 0, ); let edge5 = NewEdge::create( conn, - Some(sequence4_hash.clone()), - Some(8), - None, - None, + sequence4_hash.clone(), + 8, + NewEdge::PATH_END_HASH.to_string(), + -1, 0, 0, ); @@ -182,49 +182,49 @@ mod tests { let sequence1_hash = Sequence::create(conn, "DNA", "ATCGATCG", true); let edge5 = NewEdge::create( conn, - Some(sequence1_hash.clone()), - Some(0), - None, - None, + sequence1_hash.clone(), + 0, + NewEdge::PATH_END_HASH.to_string(), + -1, 0, 0, ); let sequence2_hash = Sequence::create(conn, "DNA", "AAAAAAAA", true); let edge4 = NewEdge::create( conn, - Some(sequence2_hash.clone()), - Some(1), - Some(sequence1_hash.clone()), - Some(7), + sequence2_hash.clone(), + 1, + sequence1_hash.clone(), + 7, 0, 0, ); let sequence3_hash = Sequence::create(conn, "DNA", "CCCCCCCC", true); let edge3 = NewEdge::create( conn, - Some(sequence3_hash.clone()), - Some(1), - Some(sequence2_hash.clone()), - Some(7), + sequence3_hash.clone(), + 1, + sequence2_hash.clone(), + 7, 0, 0, ); let sequence4_hash = Sequence::create(conn, "DNA", "GGGGGGGG", true); let edge2 = NewEdge::create( conn, - Some(sequence4_hash.clone()), - Some(1), - Some(sequence3_hash.clone()), - Some(7), + sequence4_hash.clone(), + 1, + sequence3_hash.clone(), + 7, 0, 0, ); let edge1 = NewEdge::create( conn, - None, - None, - Some(sequence4_hash.clone()), - Some(7), + NewEdge::PATH_START_HASH.to_string(), + -1, + sequence4_hash.clone(), + 7, 0, 0, ); From 3d1b48ca846a44e7781870f61c35a9821296181e Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 20 Aug 2024 21:20:50 -0400 Subject: [PATCH 09/18] Some cleanup --- src/models.rs | 22 ++++++++-------------- src/models/new_edge.rs | 2 +- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/models.rs b/src/models.rs index bf233fa..f6f3a86 100644 --- a/src/models.rs +++ b/src/models.rs @@ -16,7 +16,7 @@ pub mod sequence; use crate::graph::all_simple_paths; use crate::models::block::Block; use crate::models::edge::Edge; -use crate::models::new_edge::NewEdge; +use crate::models::new_edge::{EdgeData, NewEdge}; use crate::models::path::{NewBlock, Path, PathBlock}; use crate::models::sequence::Sequence; use crate::{get_overlap, models}; @@ -513,7 +513,7 @@ impl BlockGroup { new_block: &NewBlock, chromosome_index: i32, phased: i32, - ) -> Vec { + ) { // todo: // cases to check: // change that is the size of a block @@ -536,8 +536,7 @@ impl BlockGroup { if new_block.sequence_start == new_block.sequence_end { // Deletion - let new_edge = NewEdge { - id: 0, + let new_edge = EdgeData { source_hash: start_block.sequence.hash.clone(), source_coordinate: start - start_block.path_start + start_block.sequence_start, target_hash: end_block.sequence.hash.clone(), @@ -548,8 +547,7 @@ impl BlockGroup { new_edges.push(new_edge); } else { // Insertion/replacement - let new_start_edge = NewEdge { - id: 0, + let new_start_edge = EdgeData { source_hash: start_block.sequence.hash.clone(), source_coordinate: start - start_block.path_start + start_block.sequence_start, target_hash: new_block.sequence.hash.clone(), @@ -557,8 +555,7 @@ impl BlockGroup { chromosome_index, phased, }; - let new_end_edge = NewEdge { - id: 0, + let new_end_edge = EdgeData { source_hash: new_block.sequence.hash.clone(), source_coordinate: new_block.sequence_end, target_hash: end_block.sequence.hash.clone(), @@ -574,8 +571,7 @@ impl BlockGroup { // retrieve it as one node of the overall graph if start < start_block.path_end { let split_coordinate = start - start_block.path_start + start_block.sequence_start; - let new_split_start_edge = NewEdge { - id: 0, + let new_split_start_edge = EdgeData { source_hash: start_block.sequence.hash.clone(), source_coordinate: split_coordinate, target_hash: start_block.sequence.hash.clone(), @@ -588,8 +584,7 @@ impl BlockGroup { if end > end_block.path_start { let split_coordinate = end - end_block.path_start + end_block.sequence_start; - let new_split_end_edge = NewEdge { - id: 0, + let new_split_end_edge = EdgeData { source_hash: new_block.sequence.hash.clone(), source_coordinate: split_coordinate, target_hash: end_block.sequence.hash.clone(), @@ -601,8 +596,7 @@ impl BlockGroup { new_edges.push(new_split_end_edge); } - // TODO: bulk create-or-get of new edges - new_edges + NewEdge::bulk_create(conn, new_edges); } } diff --git a/src/models/new_edge.rs b/src/models/new_edge.rs index 57aa5fc..82f3be9 100644 --- a/src/models/new_edge.rs +++ b/src/models/new_edge.rs @@ -172,7 +172,7 @@ impl NewEdge { } let formatted_edge_rows_to_insert = edge_rows_to_insert.join(", "); - let insert_statement = format!("INSERT OR IGNORE INTO new_edges (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) VALUES {0} RETURNING (id);", formatted_edge_rows_to_insert); + let insert_statement = format!("INSERT INTO new_edges (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) VALUES {0} RETURNING (id);", formatted_edge_rows_to_insert); let mut stmt = conn.prepare(&insert_statement).unwrap(); let rows = stmt.query_map([], |row| row.get(0)).unwrap(); let mut edge_ids: Vec = vec![]; From 23c0c19857f555a0a3fab34025b724d0823bb31c Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 21 Aug 2024 11:16:35 -0400 Subject: [PATCH 10/18] Fix tests --- src/models/new_edge.rs | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/models/new_edge.rs b/src/models/new_edge.rs index 82f3be9..08ae5e7 100644 --- a/src/models/new_edge.rs +++ b/src/models/new_edge.rs @@ -1,6 +1,6 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::hash::RandomState; #[derive(Clone, Debug)] @@ -248,18 +248,20 @@ mod tests { let edges = NewEdge::bulk_load(conn, edge_ids); assert_eq!(edges.len(), 3); - let edge_result1 = &edges[0]; - assert_eq!(edge_result1.source_hash, NewEdge::PATH_START_HASH); + let edges_by_source_hash = edges + .into_iter() + .map(|edge| (edge.source_hash.clone(), edge)) + .collect::>(); + + let edge_result1 = edges_by_source_hash.get(NewEdge::PATH_START_HASH).unwrap(); assert_eq!(edge_result1.source_coordinate, -1); assert_eq!(edge_result1.target_hash, sequence1_hash); assert_eq!(edge_result1.target_coordinate, 1); - let edge_result2 = &edges[1]; - assert_eq!(edge_result2.source_hash, sequence1_hash); + let edge_result2 = edges_by_source_hash.get(&sequence1_hash).unwrap(); assert_eq!(edge_result2.source_coordinate, 2); assert_eq!(edge_result2.target_hash, sequence2_hash); assert_eq!(edge_result2.target_coordinate, 3); - let edge_result3 = &edges[2]; - assert_eq!(edge_result3.source_hash, sequence2_hash); + let edge_result3 = edges_by_source_hash.get(&sequence2_hash).unwrap(); assert_eq!(edge_result3.source_coordinate, 4); assert_eq!(edge_result3.target_hash, NewEdge::PATH_END_HASH); assert_eq!(edge_result3.target_coordinate, -1); @@ -316,21 +318,23 @@ mod tests { let edges = NewEdge::bulk_load(conn, edge_ids); assert_eq!(edges.len(), 3); - let edge_result1 = &edges[0]; + let edges_by_source_hash = edges + .into_iter() + .map(|edge| (edge.source_hash.clone(), edge)) + .collect::>(); + + let edge_result1 = edges_by_source_hash.get(NewEdge::PATH_START_HASH).unwrap(); assert_eq!(edge_result1.id, existing_edge.id); - assert_eq!(edge_result1.source_hash, NewEdge::PATH_START_HASH); assert_eq!(edge_result1.source_coordinate, -1); assert_eq!(edge_result1.target_hash, sequence1_hash); assert_eq!(edge_result1.target_coordinate, 1); - let edge_result2 = &edges[2]; - assert_eq!(edge_result2.source_hash, sequence1_hash); + let edge_result2 = edges_by_source_hash.get(&sequence1_hash).unwrap(); assert_eq!(edge_result2.source_coordinate, 2); assert_eq!(edge_result2.target_hash, sequence2_hash); assert_eq!(edge_result2.target_coordinate, 3); - let edge_result3 = &edges[1]; - assert_eq!(edge_result3.source_hash, sequence2_hash); + let edge_result3 = edges_by_source_hash.get(&sequence2_hash).unwrap(); assert_eq!(edge_result3.source_coordinate, 4); assert_eq!(edge_result3.target_hash, NewEdge::PATH_END_HASH); assert_eq!(edge_result3.target_coordinate, -1); From d57048ae480caba51989c7e19520b12219731501 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 21 Aug 2024 14:01:56 -0400 Subject: [PATCH 11/18] Change command line ops to use new code --- src/main.rs | 212 ++++++++++++++++++++++++++++++++++++++++++++- src/models.rs | 17 ++-- src/models/path.rs | 14 +-- 3 files changed, 225 insertions(+), 18 deletions(-) diff --git a/src/main.rs b/src/main.rs index bea2e9d..4062576 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,7 +6,16 @@ use std::path::PathBuf; use bio::io::fasta; use gen::migrations::run_migrations; -use gen::models::{self, block::Block, edge::Edge, path::Path, sequence::Sequence, BlockGroup}; +use gen::models::{ + self, + block::Block, + edge::Edge, + new_edge::NewEdge, + path::{NewBlock, Path}, + path_edge::PathEdge, + sequence::Sequence, + BlockGroup, +}; use gen::{get_connection, parse_genotype}; use noodles::vcf; use noodles::vcf::variant::record::samples::series::value::genotype::Phasing; @@ -94,6 +103,49 @@ fn import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connection } } +fn new_import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connection) { + // TODO: support gz + let mut reader = fasta::Reader::from_file(fasta).unwrap(); + + if !models::Collection::exists(conn, name) { + let collection = models::Collection::create(conn, name); + + for result in reader.records() { + let record = result.expect("Error during fasta record parsing"); + let sequence = String::from_utf8(record.seq().to_vec()).unwrap(); + let seq_hash = Sequence::create(conn, "DNA", &sequence, !shallow); + let block_group = BlockGroup::create(conn, &collection.name, None, record.id()); + let edge_into = NewEdge::create( + conn, + NewEdge::PATH_START_HASH.to_string(), + 0, + seq_hash.to_string(), + 0, + 0, + 0, + ); + let edge_out_of = NewEdge::create( + conn, + seq_hash.to_string(), + sequence.len() as i32, + NewEdge::PATH_END_HASH.to_string(), + 0, + 0, + 0, + ); + Path::new_create( + conn, + record.id(), + block_group.id, + vec![edge_into.id, edge_out_of.id], + ); + } + println!("Created it"); + } else { + println!("Collection {:1} already exists", name); + } +} + fn update_with_vcf( vcf_path: &String, collection_name: &String, @@ -233,6 +285,160 @@ fn update_with_vcf( } } +fn new_update_with_vcf( + vcf_path: &String, + collection_name: &String, + fixed_genotype: String, + fixed_sample: String, + conn: &mut Connection, +) { + run_migrations(conn); + + let mut reader = vcf::io::reader::Builder::default() + .build_from_path(vcf_path) + .expect("Unable to parse"); + let header = reader.read_header().unwrap(); + let sample_names = header.sample_names(); + for name in sample_names { + models::Sample::create(conn, name); + } + if !fixed_sample.is_empty() { + models::Sample::create(conn, &fixed_sample); + } + let mut genotype = vec![]; + if !fixed_genotype.is_empty() { + genotype = parse_genotype(&fixed_genotype); + } + + for result in reader.records() { + let record = result.unwrap(); + let seq_name = record.reference_sequence_name().to_string(); + let ref_allele = record.reference_bases(); + // this converts the coordinates to be zero based, start inclusive, end exclusive + let ref_start = record.variant_start().unwrap().unwrap().get() - 1; + let ref_end = record.variant_end(&header).unwrap().get(); + let alt_bases = record.alternate_bases(); + let alt_alleles: Vec<_> = alt_bases.iter().collect::>().unwrap(); + // TODO: fix this duplication of handling an insert + if !fixed_sample.is_empty() && !genotype.is_empty() { + for (chromosome_index, genotype) in genotype.iter().enumerate() { + if let Some(gt) = genotype { + if gt.allele != 0 { + let alt_seq = alt_alleles[chromosome_index - 1]; + let phased = match gt.phasing { + Phasing::Phased => 1, + Phasing::Unphased => 0, + }; + // TODO: new sequence may not be real and be or some sort. Handle these. + let new_sequence_hash = Sequence::create(conn, "DNA", alt_seq, true); + let sequences_by_hash = Sequence::sequences_by_hash( + conn, + vec![format!("\"{}\"", new_sequence_hash)], + ); + let sequence = sequences_by_hash.get(&new_sequence_hash).unwrap(); + let sample_bg_id = BlockGroup::get_or_create_sample_block_group( + conn, + collection_name, + &fixed_sample, + &seq_name, + ); + let sample_paths = Path::get_paths( + conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![ + SQLValue::from(sample_bg_id), + SQLValue::from(seq_name.clone()), + ], + ); + let new_block = NewBlock { + id: 0, + sequence: sequence.clone(), + block_sequence: alt_seq.to_string(), + sequence_start: 0, + sequence_end: alt_seq.len() as i32, + path_start: ref_start as i32, + path_end: ref_end as i32, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change( + conn, + &sample_paths[0], + ref_start as i32, + ref_end as i32, + &new_block, + chromosome_index as i32, + phased, + ); + } + } + } + } else { + for (sample_index, sample) in record.samples().iter().enumerate() { + let genotype = sample.get(&header, "GT"); + if genotype.is_some() { + if let Value::Genotype(genotypes) = genotype.unwrap().unwrap().unwrap() { + for (chromosome_index, gt) in genotypes.iter().enumerate() { + if gt.is_ok() { + let (allele, phasing) = gt.unwrap(); + let phased = match phasing { + Phasing::Phased => 1, + Phasing::Unphased => 0, + }; + let allele = allele.unwrap(); + if allele != 0 { + let alt_seq = alt_alleles[allele - 1]; + // TODO: new sequence may not be real and be or some sort. Handle these. + let new_sequence_hash = + Sequence::create(conn, "DNA", alt_seq, true); + let sequences_by_hash = Sequence::sequences_by_hash( + conn, + vec![format!("\"{}\"", new_sequence_hash)], + ); + let sequence = + sequences_by_hash.get(&new_sequence_hash).unwrap(); + let sample_bg_id = BlockGroup::get_or_create_sample_block_group( + conn, + collection_name, + &sample_names[sample_index], + &seq_name, + ); + let sample_paths = Path::get_paths( + conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![ + SQLValue::from(sample_bg_id), + SQLValue::from(seq_name.clone()), + ], + ); + let new_block = NewBlock { + id: 0, + sequence: sequence.clone(), + block_sequence: alt_seq.to_string(), + sequence_start: 0, + sequence_end: alt_seq.len() as i32, + path_start: ref_start as i32, + path_end: ref_end as i32, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change( + conn, + &sample_paths[0], + ref_start as i32, + ref_end as i32, + &new_block, + chromosome_index as i32, + phased, + ); + } + } + } + } + } + } + } + } +} + fn main() { let cli = Cli::parse(); @@ -242,7 +448,7 @@ fn main() { name, db, shallow, - }) => import_fasta(fasta, name, *shallow, &mut get_connection(db)), + }) => new_import_fasta(fasta, name, *shallow, &mut get_connection(db)), Some(Commands::Update { name, db, @@ -250,7 +456,7 @@ fn main() { vcf, genotype, sample, - }) => update_with_vcf( + }) => new_update_with_vcf( vcf, name, genotype.clone().unwrap_or("".to_string()), diff --git a/src/models.rs b/src/models.rs index f6f3a86..f800674 100644 --- a/src/models.rs +++ b/src/models.rs @@ -18,6 +18,7 @@ use crate::models::block::Block; use crate::models::edge::Edge; use crate::models::new_edge::{EdgeData, NewEdge}; use crate::models::path::{NewBlock, Path, PathBlock}; +use crate::models::path_edge::PathEdge; use crate::models::sequence::Sequence; use crate::{get_overlap, models}; @@ -217,11 +218,11 @@ impl BlockGroup { ); for path in existing_paths { - let mut new_blocks = vec![]; - for block in path.blocks { - new_blocks.push(*block_map.get(&block).unwrap()); - } - Path::create(conn, &path.name, target_block_group_id, new_blocks); + let edge_ids = PathEdge::edges_for(conn, path.id) + .into_iter() + .map(|edge| edge.id) + .collect(); + Path::new_create(conn, &path.name, target_block_group_id, edge_ids); } } @@ -507,7 +508,7 @@ impl BlockGroup { #[allow(clippy::too_many_arguments)] pub fn new_insert_change( conn: &mut Connection, - path: Path, + path: &Path, start: i32, end: i32, new_block: &NewBlock, @@ -567,8 +568,8 @@ impl BlockGroup { new_edges.push(new_end_edge); } - // NOTE: Add edges marking the existing "block" that is being substituted out, so we can - // retrieve it as one node of the overall graph + // NOTE: Add edges marking the existing part of the sequence that is being substituted out, + // so we can retrieve it as one node of the overall graph if start < start_block.path_end { let split_coordinate = start - start_block.path_start + start_block.sequence_start; let new_split_start_edge = EdgeData { diff --git a/src/models/path.rs b/src/models/path.rs index 8a302fb..99a3c71 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -8,7 +8,7 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; use std::collections::{HashMap, HashSet}; -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct Path { pub id: i32, pub name: String, @@ -141,7 +141,7 @@ impl Path { id: path_id, block_group_id: row.get(1)?, name: row.get(2)?, - blocks: PathBlock::get_blocks(conn, path_id), + blocks: vec![], }) }) .unwrap(); @@ -167,7 +167,7 @@ impl Path { } pub fn new_sequence(conn: &Connection, path: Path) -> String { - let blocks = Path::blocks_for(conn, path); + let blocks = Path::blocks_for(conn, &path); blocks .into_iter() .map(|block| block.block_sequence) @@ -220,7 +220,7 @@ impl Path { } } - pub fn blocks_for(conn: &Connection, path: Path) -> Vec { + pub fn blocks_for(conn: &Connection, path: &Path) -> Vec { let edges = PathEdge::edges_for(conn, path.id); let mut sequence_hashes = HashSet::new(); for edge in &edges { @@ -244,7 +244,7 @@ impl Path { for (index, (into, out_of)) in edges.into_iter().tuple_windows().enumerate() { let block = Path::edge_pairs_to_block( index as i32, - &path, + path, into, out_of, &sequences_by_hash, @@ -256,7 +256,7 @@ impl Path { blocks } - pub fn intervaltree_for(conn: &Connection, path: Path) -> IntervalTree { + pub fn intervaltree_for(conn: &Connection, path: &Path) -> IntervalTree { let blocks = Path::blocks_for(conn, path); let tree: IntervalTree = blocks .into_iter() @@ -545,7 +545,7 @@ mod tests { block_group.id, vec![edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); - let tree = Path::intervaltree_for(conn, path); + let tree = Path::intervaltree_for(conn, &path); let blocks1: Vec<_> = tree.query_point(2).map(|x| x.value.clone()).collect(); assert_eq!(blocks1.len(), 1); let block1 = &blocks1[0]; From deb6622fa602b02e7e39740000e07f4ef94c16bc Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 21 Aug 2024 18:59:49 -0400 Subject: [PATCH 12/18] Re-implement get_all_sequences --- migrations/01-initial/up.sql | 9 ++ src/main.rs | 4 + src/models.rs | 199 ++++++++++++++++++++++++++++++++- src/models/block_group_edge.rs | 54 +++++++++ 4 files changed, 265 insertions(+), 1 deletion(-) create mode 100644 src/models/block_group_edge.rs diff --git a/migrations/01-initial/up.sql b/migrations/01-initial/up.sql index 7abdfbf..90948e8 100644 --- a/migrations/01-initial/up.sql +++ b/migrations/01-initial/up.sql @@ -105,4 +105,13 @@ CREATE TABLE path_edges ( ); CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id); +CREATE TABLE block_group_edges ( + id INTEGER PRIMARY KEY NOT NULL, + block_group_id INTEGER NOT NULL, + edge_id INTEGER NOT NULL, + FOREIGN KEY(block_group_id) REFERENCES block_group(id), + FOREIGN KEY(edge_id) REFERENCES new_edges(id) +); +CREATE UNIQUE INDEX block_group_edges_uidx ON block_group_edges(block_group_id, edge_id); + INSERT INTO sequence (hash, sequence_type, sequence, "length") values ("yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", 64), ("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", 64); diff --git a/src/main.rs b/src/main.rs index 4062576..a71bac6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,6 +9,7 @@ use gen::migrations::run_migrations; use gen::models::{ self, block::Block, + block_group_edge::BlockGroupEdge, edge::Edge, new_edge::NewEdge, path::{NewBlock, Path}, @@ -133,6 +134,7 @@ fn new_import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connec 0, 0, ); + BlockGroupEdge::bulk_create(conn, block_group.id, vec![edge_into.id, edge_out_of.id]); Path::new_create( conn, record.id(), @@ -362,6 +364,7 @@ fn new_update_with_vcf( }; BlockGroup::new_insert_change( conn, + sample_bg_id, &sample_paths[0], ref_start as i32, ref_end as i32, @@ -422,6 +425,7 @@ fn new_update_with_vcf( }; BlockGroup::new_insert_change( conn, + sample_bg_id, &sample_paths[0], ref_start as i32, ref_end as i32, diff --git a/src/models.rs b/src/models.rs index f800674..a6e5024 100644 --- a/src/models.rs +++ b/src/models.rs @@ -1,3 +1,4 @@ +use itertools::Itertools; use petgraph::graphmap::DiGraphMap; use petgraph::Direction; use rusqlite::types::Value; @@ -7,6 +8,7 @@ use std::collections::{HashMap, HashSet}; use std::fmt::*; pub mod block; +pub mod block_group_edge; pub mod edge; pub mod new_edge; pub mod path; @@ -15,6 +17,7 @@ pub mod sequence; use crate::graph::all_simple_paths; use crate::models::block::Block; +use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::Edge; use crate::models::new_edge::{EdgeData, NewEdge}; use crate::models::path::{NewBlock, Path, PathBlock}; @@ -95,6 +98,21 @@ pub struct BlockGroup { pub name: String, } +#[derive(Clone)] +pub struct GroupBlock { + pub id: i32, + pub sequence_hash: String, + pub sequence: String, + pub start: i32, + pub end: i32, +} + +#[derive(Eq, Hash, PartialEq)] +pub struct BlockKey { + pub sequence_hash: String, + pub coordinate: i32, +} + impl BlockGroup { pub fn create( conn: &Connection, @@ -339,6 +357,183 @@ impl BlockGroup { sequences } + pub fn blocks_from_edges(conn: &Connection, edges: Vec) -> Vec { + let mut sequence_hashes = HashSet::new(); + for edge in &edges { + if edge.source_hash != NewEdge::PATH_START_HASH { + sequence_hashes.insert(edge.source_hash.clone()); + } + if edge.target_hash != NewEdge::PATH_END_HASH { + sequence_hashes.insert(edge.target_hash.clone()); + } + } + + let mut boundary_edges_by_hash = HashMap::>::new(); + for edge in edges { + if (edge.source_hash == edge.target_hash) + && (edge.target_coordinate == edge.source_coordinate + 1) + { + boundary_edges_by_hash + .entry(edge.source_hash.clone()) + .and_modify(|current_edges| current_edges.push(edge.clone())) + .or_insert_with(|| vec![edge.clone()]); + } + } + + let sequences_by_hash = Sequence::sequences_by_hash( + conn, + sequence_hashes + .into_iter() + .map(|hash| format!("\"{hash}\"")) + .collect(), + ); + let mut blocks = vec![]; + + let mut block_index = 0; + for (hash, sequence) in sequences_by_hash.into_iter() { + let sequence_edges = boundary_edges_by_hash.get(&hash).unwrap(); + let sorted_sequence_edges: Vec = sequence_edges + .iter() + .sorted_by(|edge1, edge2| { + Ord::cmp(&edge1.source_coordinate, &edge2.source_coordinate) + }) + .cloned() + .collect(); + let first_edge = sorted_sequence_edges[0].clone(); + let start = 0; + let end = first_edge.source_coordinate; + let block_sequence = sequence.sequence[start as usize..end as usize].to_string(); + let first_block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + sequence: block_sequence, + start, + end, + }; + blocks.push(first_block); + block_index += 1; + for (into, out_of) in sorted_sequence_edges.clone().into_iter().tuple_windows() { + let start = into.target_coordinate; + let end = out_of.source_coordinate; + let block_sequence = sequence.sequence[start as usize..end as usize].to_string(); + let block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + sequence: block_sequence, + start, + end, + }; + blocks.push(block); + block_index += 1; + } + let last_edge = &sorted_sequence_edges[sorted_sequence_edges.len() - 1]; + let start = last_edge.target_coordinate; + let end = sequence.sequence.len() as i32; + let block_sequence = sequence.sequence[start as usize..end as usize].to_string(); + let last_block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + sequence: block_sequence, + start, + end, + }; + blocks.push(last_block); + block_index += 1; + } + + blocks + } + + pub fn new_get_all_sequences(conn: &Connection, block_group_id: i32) -> HashSet { + let edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id); + let blocks = BlockGroup::blocks_from_edges(conn, edges.clone()); + + let blocks_by_start = blocks + .clone() + .into_iter() + .map(|block| { + ( + BlockKey { + sequence_hash: block.sequence_hash, + coordinate: block.start, + }, + block.id, + ) + }) + .collect::>(); + let blocks_by_end = blocks + .clone() + .into_iter() + .map(|block| { + ( + BlockKey { + sequence_hash: block.sequence_hash, + coordinate: block.end, + }, + block.id, + ) + }) + .collect::>(); + let blocks_by_id = blocks + .clone() + .into_iter() + .map(|block| (block.id, block)) + .collect::>(); + + let mut graph: DiGraphMap = DiGraphMap::new(); + for block in blocks { + graph.add_node(block.id); + } + for edge in edges { + let source_key = BlockKey { + sequence_hash: edge.source_hash, + coordinate: edge.source_coordinate, + }; + let source_id = blocks_by_end.get(&source_key).unwrap(); + let target_key = BlockKey { + sequence_hash: edge.target_hash, + coordinate: edge.target_coordinate, + }; + let target_id = blocks_by_start.get(&target_key).unwrap(); + graph.add_edge(*source_id, *target_id, ()); + } + let mut start_nodes = vec![]; + let mut end_nodes = vec![]; + for node in graph.nodes() { + let has_incoming = graph.neighbors_directed(node, Direction::Incoming).next(); + let has_outgoing = graph.neighbors_directed(node, Direction::Outgoing).next(); + if has_incoming.is_none() { + start_nodes.push(node); + } + if has_outgoing.is_none() { + end_nodes.push(node); + } + } + let mut sequences = HashSet::::new(); + + for start_node in start_nodes { + for end_node in &end_nodes { + // TODO: maybe make all_simple_paths return a single path id where start == end + if start_node == *end_node { + let block = blocks_by_id.get(&start_node).unwrap(); + sequences.insert(block.sequence.clone()); + } else { + for path in all_simple_paths(&graph, start_node, *end_node) { + let mut current_sequence = "".to_string(); + for node in path { + let block = blocks_by_id.get(&node).unwrap(); + let block_sequence = block.sequence.clone(); + current_sequence.push_str(&block_sequence); + } + sequences.insert(current_sequence); + } + } + } + } + + sequences + } + #[allow(clippy::ptr_arg)] #[allow(clippy::too_many_arguments)] pub fn insert_change( @@ -508,6 +703,7 @@ impl BlockGroup { #[allow(clippy::too_many_arguments)] pub fn new_insert_change( conn: &mut Connection, + block_group_id: i32, path: &Path, start: i32, end: i32, @@ -597,7 +793,8 @@ impl BlockGroup { new_edges.push(new_split_end_edge); } - NewEdge::bulk_create(conn, new_edges); + let edge_ids = NewEdge::bulk_create(conn, new_edges); + BlockGroupEdge::bulk_create(conn, block_group_id, edge_ids); } } diff --git a/src/models/block_group_edge.rs b/src/models/block_group_edge.rs new file mode 100644 index 0000000..9a19675 --- /dev/null +++ b/src/models/block_group_edge.rs @@ -0,0 +1,54 @@ +use crate::models::new_edge::NewEdge; +use rusqlite::types::Value; +use rusqlite::{params_from_iter, Connection}; + +#[derive(Clone, Debug)] +pub struct BlockGroupEdge { + pub id: i32, + pub block_group_id: i32, + pub edge_id: i32, +} + +impl BlockGroupEdge { + pub fn bulk_create(conn: &Connection, block_group_id: i32, edge_ids: Vec) { + let mut rows_to_insert = vec![]; + for edge_id in edge_ids { + let row = format!("({0}, {1})", block_group_id, edge_id); + rows_to_insert.push(row); + } + let formatted_rows_to_insert = rows_to_insert.join(", "); + + let insert_statement = format!( + "INSERT OR IGNORE INTO block_group_edges (block_group_id, edge_id) VALUES {0};", + formatted_rows_to_insert + ); + let _ = conn.execute(&insert_statement, ()); + } + + pub fn edges_for_block_group(conn: &Connection, block_group_id: i32) -> Vec { + let query = format!( + "select * from block_group_edges where block_group_id = {};", + block_group_id + ); + let block_group_edges = BlockGroupEdge::query(conn, &query, vec![]); + let edge_ids = block_group_edges + .into_iter() + .map(|block_group_edge| block_group_edge.edge_id) + .collect(); + NewEdge::bulk_load(conn, edge_ids) + } + + pub fn query(conn: &Connection, query: &str, placeholders: Vec) -> Vec { + let mut stmt = conn.prepare(query).unwrap(); + let rows = stmt + .query_map(params_from_iter(placeholders), |row| { + Ok(BlockGroupEdge { + id: row.get(0)?, + block_group_id: row.get(1)?, + edge_id: row.get(2)?, + }) + }) + .unwrap(); + rows.map(|row| row.unwrap()).collect() + } +} From 91eb838b1cbbf2f5f5f328e594f3c469d3a3f0d2 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 21 Aug 2024 21:55:24 -0400 Subject: [PATCH 13/18] Add tests, fix bugs --- src/models.rs | 241 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 194 insertions(+), 47 deletions(-) diff --git a/src/models.rs b/src/models.rs index a6e5024..7d1d8e0 100644 --- a/src/models.rs +++ b/src/models.rs @@ -371,7 +371,7 @@ impl BlockGroup { let mut boundary_edges_by_hash = HashMap::>::new(); for edge in edges { if (edge.source_hash == edge.target_hash) - && (edge.target_coordinate == edge.source_coordinate + 1) + && (edge.target_coordinate == edge.source_coordinate) { boundary_edges_by_hash .entry(edge.source_hash.clone()) @@ -391,56 +391,68 @@ impl BlockGroup { let mut block_index = 0; for (hash, sequence) in sequences_by_hash.into_iter() { - let sequence_edges = boundary_edges_by_hash.get(&hash).unwrap(); - let sorted_sequence_edges: Vec = sequence_edges - .iter() - .sorted_by(|edge1, edge2| { - Ord::cmp(&edge1.source_coordinate, &edge2.source_coordinate) - }) - .cloned() - .collect(); - let first_edge = sorted_sequence_edges[0].clone(); - let start = 0; - let end = first_edge.source_coordinate; - let block_sequence = sequence.sequence[start as usize..end as usize].to_string(); - let first_block = GroupBlock { - id: block_index, - sequence_hash: hash.clone(), - sequence: block_sequence, - start, - end, - }; - blocks.push(first_block); - block_index += 1; - for (into, out_of) in sorted_sequence_edges.clone().into_iter().tuple_windows() { - let start = into.target_coordinate; - let end = out_of.source_coordinate; + let sequence_edges = boundary_edges_by_hash.get(&hash); + if sequence_edges.is_some() { + let sorted_sequence_edges: Vec = sequence_edges + .unwrap() + .iter() + .sorted_by(|edge1, edge2| { + Ord::cmp(&edge1.source_coordinate, &edge2.source_coordinate) + }) + .cloned() + .collect(); + let first_edge = sorted_sequence_edges[0].clone(); + let start = 0; + let end = first_edge.source_coordinate; + let block_sequence = sequence.sequence[start as usize..end as usize].to_string(); + let first_block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + sequence: block_sequence, + start, + end, + }; + blocks.push(first_block); + block_index += 1; + for (into, out_of) in sorted_sequence_edges.clone().into_iter().tuple_windows() { + let start = into.target_coordinate; + let end = out_of.source_coordinate; + let block_sequence = + sequence.sequence[start as usize..end as usize].to_string(); + let block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + sequence: block_sequence, + start, + end, + }; + blocks.push(block); + block_index += 1; + } + let last_edge = &sorted_sequence_edges[sorted_sequence_edges.len() - 1]; + let start = last_edge.target_coordinate; + let end = sequence.sequence.len() as i32; let block_sequence = sequence.sequence[start as usize..end as usize].to_string(); - let block = GroupBlock { + let last_block = GroupBlock { id: block_index, sequence_hash: hash.clone(), sequence: block_sequence, start, end, }; - blocks.push(block); + blocks.push(last_block); + block_index += 1; + } else { + blocks.push(GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + sequence: sequence.sequence.clone(), + start: 0, + end: sequence.sequence.len() as i32, + }); block_index += 1; } - let last_edge = &sorted_sequence_edges[sorted_sequence_edges.len() - 1]; - let start = last_edge.target_coordinate; - let end = sequence.sequence.len() as i32; - let block_sequence = sequence.sequence[start as usize..end as usize].to_string(); - let last_block = GroupBlock { - id: block_index, - sequence_hash: hash.clone(), - sequence: block_sequence, - start, - end, - }; - blocks.push(last_block); - block_index += 1; } - blocks } @@ -489,14 +501,19 @@ impl BlockGroup { sequence_hash: edge.source_hash, coordinate: edge.source_coordinate, }; - let source_id = blocks_by_end.get(&source_key).unwrap(); + let source_id = blocks_by_end.get(&source_key); let target_key = BlockKey { sequence_hash: edge.target_hash, coordinate: edge.target_coordinate, }; - let target_id = blocks_by_start.get(&target_key).unwrap(); - graph.add_edge(*source_id, *target_id, ()); + let target_id = blocks_by_start.get(&target_key); + if let Some(source_id_value) = source_id { + if let Some(target_id_value) = target_id { + graph.add_edge(*source_id_value, *target_id_value, ()); + } + } } + let mut start_nodes = vec![]; let mut end_nodes = vec![]; for node in graph.nodes() { @@ -772,7 +789,7 @@ impl BlockGroup { source_hash: start_block.sequence.hash.clone(), source_coordinate: split_coordinate, target_hash: start_block.sequence.hash.clone(), - target_coordinate: split_coordinate + 1, + target_coordinate: split_coordinate, chromosome_index, phased, }; @@ -782,10 +799,10 @@ impl BlockGroup { if end > end_block.path_start { let split_coordinate = end - end_block.path_start + end_block.sequence_start; let new_split_end_edge = EdgeData { - source_hash: new_block.sequence.hash.clone(), + source_hash: end_block.sequence.hash.clone(), source_coordinate: split_coordinate, target_hash: end_block.sequence.hash.clone(), - target_coordinate: split_coordinate + 1, + target_coordinate: split_coordinate, chromosome_index, phased, }; @@ -1132,4 +1149,134 @@ mod tests { ]) ); } + + fn setup_multipath(conn: &Connection) -> (i32, Path) { + let a_seq_hash = Sequence::create(conn, "DNA", "AAAAAAAAAA", true); + let t_seq_hash = Sequence::create(conn, "DNA", "TTTTTTTTTT", true); + let c_seq_hash = Sequence::create(conn, "DNA", "CCCCCCCCCC", true); + let g_seq_hash = Sequence::create(conn, "DNA", "GGGGGGGGGG", true); + let _collection = Collection::create(conn, "test"); + let block_group = BlockGroup::create(conn, "test", None, "hg19"); + let edge0 = NewEdge::create( + conn, + NewEdge::PATH_START_HASH.to_string(), + 0, + a_seq_hash.clone(), + 0, + 0, + 0, + ); + let edge1 = NewEdge::create(conn, a_seq_hash, 10, t_seq_hash.clone(), 0, 0, 0); + let edge2 = NewEdge::create(conn, t_seq_hash, 10, c_seq_hash.clone(), 0, 0, 0); + let edge3 = NewEdge::create(conn, c_seq_hash, 10, g_seq_hash.clone(), 0, 0, 0); + let edge4 = NewEdge::create( + conn, + g_seq_hash, + 10, + NewEdge::PATH_END_HASH.to_string(), + 0, + 0, + 0, + ); + BlockGroupEdge::bulk_create( + conn, + block_group.id, + vec![edge0.id, edge1.id, edge2.id, edge3.id, edge4.id], + ); + let path = Path::new_create( + conn, + "chr1", + block_group.id, + vec![edge0.id, edge1.id, edge2.id, edge3.id, edge4.id], + ); + (block_group.id, path) + } + + #[test] + fn insert_and_deletion_new_get_all() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); + let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 7, + path_end: 15, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + + let deletion_sequence_hash = Sequence::create(&conn, "DNA", "", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", deletion_sequence_hash)]); + let deletion_sequence = sequences_by_hash.get(&deletion_sequence_hash).unwrap(); + let deletion = NewBlock { + id: 0, + sequence: deletion_sequence.clone(), + block_sequence: deletion_sequence.sequence.clone(), + sequence_start: 0, + sequence_end: 0, + path_start: 19, + path_end: 31, + strand: "+".to_string(), + }; + + // take out an entire block. + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 19, 31, &deletion, 1, 0); + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTTTTTTTTGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTGGGGGGGGG".to_string(), + ]) + ) + } + + #[test] + fn simple_insert_new_get_all() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); + let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 7, + path_end: 15, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } } From d07edda697d58a9ae57f9cd9b4e46093eb65ad24 Mon Sep 17 00:00:00 2001 From: hofer Date: Thu, 22 Aug 2024 16:30:31 -0400 Subject: [PATCH 14/18] More unit tests, more fixes --- src/models.rs | 306 +++++++++++++++++++++++++++++++++++++++-- src/models/new_edge.rs | 7 +- 2 files changed, 304 insertions(+), 9 deletions(-) diff --git a/src/models.rs b/src/models.rs index 7d1d8e0..2010a01 100644 --- a/src/models.rs +++ b/src/models.rs @@ -718,6 +718,7 @@ impl BlockGroup { #[allow(clippy::ptr_arg)] #[allow(clippy::too_many_arguments)] + #[allow(clippy::needless_late_init)] pub fn new_insert_change( conn: &mut Connection, block_group_id: i32, @@ -728,19 +729,26 @@ impl BlockGroup { chromosome_index: i32, phased: i32, ) { - // todo: - // cases to check: - // change that is the size of a block - // change that goes over multiple blocks - // change that hits just start/end boundary, e.g. block is 1,5 and change is 3,5 or 1,3. - // change that deletes block boundary - let tree = Path::intervaltree_for(conn, path); let start_blocks: Vec = tree.query_point(start).map(|x| x.value.clone()).collect(); assert_eq!(start_blocks.len(), 1); - let start_block = &start_blocks[0]; + // NOTE: This may not be used but needs to be initialized here instead of inside the if + // statement that uses it, so that the borrow checker is happy + let previous_start_blocks: Vec = tree + .query_point(start - 1) + .map(|x| x.value.clone()) + .collect(); + assert_eq!(previous_start_blocks.len(), 1); + let start_block; + if start_blocks[0].path_start == start { + // First part of this block will be replaced/deleted, need to get previous block to add + // edge including it + start_block = &previous_start_blocks[0]; + } else { + start_block = &start_blocks[0]; + } let end_blocks: Vec = tree.query_point(end).map(|x| x.value.clone()).collect(); assert_eq!(end_blocks.len(), 1); @@ -1279,4 +1287,286 @@ mod tests { ]) ); } + + #[test] + fn insert_on_block_boundary_middle_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); + let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 15, + path_end: 15, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 15, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTTTTNNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_within_block_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); + let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 12, + path_end: 17, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 12, 17, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTNNNNTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_on_block_boundary_start_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); + let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 10, + path_end: 10, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 10, 10, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAANNNNTTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_on_block_boundary_end_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); + let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 9, + path_end: 9, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 9, 9, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAANNNNATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_across_entire_block_boundary_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); + let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 10, + path_end: 20, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 10, 20, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAANNNNCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_across_two_blocks_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); + let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 15, + path_end: 25, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 15, 25, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTTTTNNNNCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_spanning_blocks_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); + let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 5, + path_end: 35, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 5, 35, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAANNNNGGGGG".to_string() + ]) + ); + } + + #[test] + fn simple_deletion_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let deletion_sequence_hash = Sequence::create(&conn, "DNA", "", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", deletion_sequence_hash)]); + let deletion_sequence = sequences_by_hash.get(&deletion_sequence_hash).unwrap(); + let deletion = NewBlock { + id: 0, + sequence: deletion_sequence.clone(), + block_sequence: deletion_sequence.sequence.clone(), + sequence_start: 0, + sequence_end: 0, + path_start: 19, + path_end: 31, + strand: "+".to_string(), + }; + + // take out an entire block. + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 19, 31, &deletion, 1, 0); + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTTTTTTTTGGGGGGGGG".to_string(), + ]) + ) + } + + #[test] + fn doesnt_apply_same_insert_twice_new() { + let mut conn = get_connection(); + let (block_group_id, path) = setup_multipath(&conn); + let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); + let sequences_by_hash = + Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); + let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert = NewBlock { + id: 0, + sequence: insert_sequence.clone(), + block_sequence: insert_sequence.sequence[0..4].to_string(), + sequence_start: 0, + sequence_end: 4, + path_start: 7, + path_end: 15, + strand: "+".to_string(), + }; + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + + BlockGroup::new_insert_change(&mut conn, block_group_id, &path, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::new_get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } } diff --git a/src/models/new_edge.rs b/src/models/new_edge.rs index 08ae5e7..e3c56ff 100644 --- a/src/models/new_edge.rs +++ b/src/models/new_edge.rs @@ -41,7 +41,7 @@ impl NewEdge { ) -> NewEdge { let query = "INSERT INTO new_edges (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6) RETURNING *"; let id_query = "select id from new_edges where source_hash = ?1 and source_coordinate = ?2 and target_hash = ?3 and target_coordinate = ?4 and chromosome_index = ?5 and phased = ?6"; - let mut placeholders: Vec = vec![ + let placeholders: Vec = vec![ source_hash.clone().into(), source_coordinate.into(), target_hash.clone().into(), @@ -170,6 +170,11 @@ impl NewEdge { ); edge_rows_to_insert.push(edge_row); } + + if edge_rows_to_insert.is_empty() { + return existing_edge_ids; + } + let formatted_edge_rows_to_insert = edge_rows_to_insert.join(", "); let insert_statement = format!("INSERT INTO new_edges (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) VALUES {0} RETURNING (id);", formatted_edge_rows_to_insert); From 1d85d74942fa2ed6e48a5fd939af53cfd3b80e2c Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 26 Aug 2024 10:09:13 -0400 Subject: [PATCH 15/18] Rework sequence lookup --- src/main.rs | 14 +++------- src/models.rs | 61 ++++++++++++------------------------------ src/models/path.rs | 9 ++----- src/models/sequence.rs | 11 +++++++- 4 files changed, 33 insertions(+), 62 deletions(-) diff --git a/src/main.rs b/src/main.rs index a71bac6..35b6d62 100644 --- a/src/main.rs +++ b/src/main.rs @@ -333,11 +333,8 @@ fn new_update_with_vcf( }; // TODO: new sequence may not be real and be or some sort. Handle these. let new_sequence_hash = Sequence::create(conn, "DNA", alt_seq, true); - let sequences_by_hash = Sequence::sequences_by_hash( - conn, - vec![format!("\"{}\"", new_sequence_hash)], - ); - let sequence = sequences_by_hash.get(&new_sequence_hash).unwrap(); + let sequence = + Sequence::sequence_from_hash(conn, &new_sequence_hash).unwrap(); let sample_bg_id = BlockGroup::get_or_create_sample_block_group( conn, collection_name, @@ -393,12 +390,9 @@ fn new_update_with_vcf( // TODO: new sequence may not be real and be or some sort. Handle these. let new_sequence_hash = Sequence::create(conn, "DNA", alt_seq, true); - let sequences_by_hash = Sequence::sequences_by_hash( - conn, - vec![format!("\"{}\"", new_sequence_hash)], - ); let sequence = - sequences_by_hash.get(&new_sequence_hash).unwrap(); + Sequence::sequence_from_hash(conn, &new_sequence_hash) + .unwrap(); let sample_bg_id = BlockGroup::get_or_create_sample_block_group( conn, collection_name, diff --git a/src/models.rs b/src/models.rs index 2010a01..0872b38 100644 --- a/src/models.rs +++ b/src/models.rs @@ -296,7 +296,7 @@ impl BlockGroup { } let sequence_hashes = block_map .values() - .map(|block| format!("\"{id}\"", id = block.sequence_hash)) + .map(|block| block.sequence_hash.clone()) .collect::>(); let sequence_map = Sequence::sequences_by_hash(conn, sequence_hashes); let block_ids = block_map @@ -380,13 +380,8 @@ impl BlockGroup { } } - let sequences_by_hash = Sequence::sequences_by_hash( - conn, - sequence_hashes - .into_iter() - .map(|hash| format!("\"{hash}\"")) - .collect(), - ); + let sequences_by_hash = + Sequence::sequences_by_hash(conn, sequence_hashes.into_iter().collect::>()); let mut blocks = vec![]; let mut block_index = 0; @@ -1205,9 +1200,7 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); - let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); let insert = NewBlock { id: 0, sequence: insert_sequence.clone(), @@ -1230,9 +1223,8 @@ mod tests { ); let deletion_sequence_hash = Sequence::create(&conn, "DNA", "", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", deletion_sequence_hash)]); - let deletion_sequence = sequences_by_hash.get(&deletion_sequence_hash).unwrap(); + let deletion_sequence = + Sequence::sequence_from_hash(&conn, &deletion_sequence_hash).unwrap(); let deletion = NewBlock { id: 0, sequence: deletion_sequence.clone(), @@ -1263,9 +1255,7 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); - let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); let insert = NewBlock { id: 0, sequence: insert_sequence.clone(), @@ -1293,9 +1283,7 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); - let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); let insert = NewBlock { id: 0, sequence: insert_sequence.clone(), @@ -1323,9 +1311,7 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); - let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); let insert = NewBlock { id: 0, sequence: insert_sequence.clone(), @@ -1353,9 +1339,7 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); - let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); let insert = NewBlock { id: 0, sequence: insert_sequence.clone(), @@ -1383,9 +1367,7 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); - let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); let insert = NewBlock { id: 0, sequence: insert_sequence.clone(), @@ -1413,9 +1395,7 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); - let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); let insert = NewBlock { id: 0, sequence: insert_sequence.clone(), @@ -1443,9 +1423,7 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); - let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); let insert = NewBlock { id: 0, sequence: insert_sequence.clone(), @@ -1473,9 +1451,7 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); - let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); let insert = NewBlock { id: 0, sequence: insert_sequence.clone(), @@ -1503,9 +1479,8 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let deletion_sequence_hash = Sequence::create(&conn, "DNA", "", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", deletion_sequence_hash)]); - let deletion_sequence = sequences_by_hash.get(&deletion_sequence_hash).unwrap(); + let deletion_sequence = + Sequence::sequence_from_hash(&conn, &deletion_sequence_hash).unwrap(); let deletion = NewBlock { id: 0, sequence: deletion_sequence.clone(), @@ -1534,9 +1509,7 @@ mod tests { let mut conn = get_connection(); let (block_group_id, path) = setup_multipath(&conn); let insert_sequence_hash = Sequence::create(&conn, "DNA", "NNNN", true); - let sequences_by_hash = - Sequence::sequences_by_hash(&conn, vec![format!("\"{}\"", insert_sequence_hash)]); - let insert_sequence = sequences_by_hash.get(&insert_sequence_hash).unwrap(); + let insert_sequence = Sequence::sequence_from_hash(&conn, &insert_sequence_hash).unwrap(); let insert = NewBlock { id: 0, sequence: insert_sequence.clone(), diff --git a/src/models/path.rs b/src/models/path.rs index 99a3c71..518dfa4 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -231,13 +231,8 @@ impl Path { sequence_hashes.insert(edge.target_hash.clone()); } } - let sequences_by_hash = Sequence::sequences_by_hash( - conn, - sequence_hashes - .into_iter() - .map(|hash| format!("\"{hash}\"")) - .collect(), - ); + let sequences_by_hash = + Sequence::sequences_by_hash(conn, sequence_hashes.into_iter().collect()); let mut blocks = vec![]; let mut path_length = 0; diff --git a/src/models/sequence.rs b/src/models/sequence.rs index ce6dd65..708e668 100644 --- a/src/models/sequence.rs +++ b/src/models/sequence.rs @@ -66,7 +66,11 @@ impl Sequence { } pub fn sequences_by_hash(conn: &Connection, hashes: Vec) -> HashMap { - let joined_hashes = &hashes.join(","); + let joined_hashes = &hashes + .into_iter() + .map(|hash| format!("\"{}\"", hash)) + .collect::>() + .join(","); let sequences = Sequence::sequences( conn, &format!("select * from sequence where hash in ({0})", joined_hashes), @@ -77,4 +81,9 @@ impl Sequence { .map(|sequence| (sequence.hash.clone(), sequence)) .collect::>() } + + pub fn sequence_from_hash(conn: &Connection, hash: &str) -> Option { + let sequences_by_hash = Sequence::sequences_by_hash(conn, vec![hash.to_string()]); + sequences_by_hash.get(hash).cloned() + } } From 8d5ab5755c86b4849eb9eef7561173c1f63af420 Mon Sep 17 00:00:00 2001 From: chris Mitchell Date: Mon, 26 Aug 2024 10:51:36 -0400 Subject: [PATCH 16/18] Switch to noodle fasta parser --- Cargo.lock | 577 +--------------------------------------------------- Cargo.toml | 3 +- src/main.rs | 39 ++-- 3 files changed, 25 insertions(+), 594 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12cb1b5..4e6b015 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,15 +29,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "aho-corasick" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" -dependencies = [ - "memchr", -] - [[package]] name = "anstream" version = "0.6.15" @@ -87,21 +78,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "anyhow" -version = "1.0.86" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" - -[[package]] -name = "approx" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" -dependencies = [ - "num-traits", -] - [[package]] name = "async-compression" version = "0.4.12" @@ -136,79 +112,12 @@ dependencies = [ "rustc-demangle", ] -[[package]] -name = "bio" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8cbd545253762ecf9ef741f2c49f07c06a0ce4d041d74ee9c3f1ce0e2d5446e" -dependencies = [ - "anyhow", - "approx", - "bio-types", - "bit-set", - "bv", - "bytecount", - "csv", - "custom_derive", - "editdistancek", - "enum-map", - "fxhash", - "itertools", - "itertools-num", - "lazy_static", - "multimap", - "ndarray", - "newtype_derive", - "num-integer", - "num-traits", - "ordered-float", - "petgraph", - "rand", - "regex", - "serde", - "serde_derive", - "statrs", - "strum", - "strum_macros", - "thiserror", - "triple_accel", - "vec_map", -] - -[[package]] -name = "bio-types" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cc7edd677651969cc262a8dfb870f0c2266c3ceeaf863d742982e39699ff460" -dependencies = [ - "derive-new", - "lazy_static", - "regex", - "strum_macros", - "thiserror", -] - -[[package]] -name = "bit-set" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" -dependencies = [ - "bit-vec 0.8.0", -] - [[package]] name = "bit-vec" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2c54ff287cfc0a34f38a6b832ea1bd8e448a330b3e40a50859e6488bee07f22" -[[package]] -name = "bit-vec" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" - [[package]] name = "bitflags" version = "2.6.0" @@ -234,28 +143,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bv" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8834bb1d8ee5dc048ee3124f2c7c1afcc6bc9aed03f11e9dfd8c69470a5db340" -dependencies = [ - "feature-probe", - "serde", -] - -[[package]] -name = "bytecount" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" - -[[package]] -name = "bytemuck" -version = "1.16.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" - [[package]] name = "byteorder" version = "1.5.0" @@ -390,44 +277,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "csv" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" -dependencies = [ - "memchr", -] - -[[package]] -name = "custom_derive" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" - -[[package]] -name = "derive-new" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d150dea618e920167e5973d70ae6ece4385b7164e0d799fe7c122dd0a5d912ad" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "digest" version = "0.10.7" @@ -438,38 +287,12 @@ dependencies = [ "crypto-common", ] -[[package]] -name = "editdistancek" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e02df23d5b1c6f9e69fa603b890378123b93073df998a21e6e33b9db0a32613" - [[package]] name = "either" version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" -[[package]] -name = "enum-map" -version = "2.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6866f3bfdf8207509a033af1a75a7b08abda06bbaaeae6669323fd5a097df2e9" -dependencies = [ - "enum-map-derive", -] - -[[package]] -name = "enum-map-derive" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "equivalent" version = "1.0.1" @@ -488,12 +311,6 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" -[[package]] -name = "feature-probe" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "835a3dc7d1ec9e75e2b5fb4ba75396837112d2060b03f7d43bc1897c7f7211da" - [[package]] name = "fixedbitset" version = "0.4.2" @@ -575,20 +392,10 @@ dependencies = [ "slab", ] -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - [[package]] name = "gen" version = "0.1.0" dependencies = [ - "bio", "clap", "include_dir", "intervaltree", @@ -610,17 +417,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "getrandom" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - [[package]] name = "gimli" version = "0.29.0" @@ -704,27 +500,6 @@ dependencies = [ "either", ] -[[package]] -name = "itertools-num" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a872a22f9e6f7521ca557660adb96dd830e54f0f490fa115bb55dd69d38b27e7" -dependencies = [ - "num-traits", -] - -[[package]] -name = "itoa" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" - -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - [[package]] name = "lexical-core" version = "0.8.5" @@ -795,12 +570,6 @@ version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" -[[package]] -name = "libm" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" - [[package]] name = "libsqlite3-sys" version = "0.28.0" @@ -829,16 +598,6 @@ dependencies = [ "pkg-config", ] -[[package]] -name = "matrixmultiply" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" -dependencies = [ - "autocfg", - "rawpointer", -] - [[package]] name = "md-5" version = "0.10.6" @@ -864,66 +623,6 @@ dependencies = [ "adler", ] -[[package]] -name = "multimap" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" -dependencies = [ - "serde", -] - -[[package]] -name = "nalgebra" -version = "0.32.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5c17de023a86f59ed79891b2e5d5a94c705dbe904a5b5c9c952ea6221b03e4" -dependencies = [ - "approx", - "matrixmultiply", - "nalgebra-macros", - "num-complex", - "num-rational", - "num-traits", - "rand", - "rand_distr", - "simba", - "typenum", -] - -[[package]] -name = "nalgebra-macros" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "ndarray" -version = "0.15.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" -dependencies = [ - "matrixmultiply", - "num-complex", - "num-integer", - "num-traits", - "rawpointer", -] - -[[package]] -name = "newtype_derive" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec" -dependencies = [ - "rustc_version", -] - [[package]] name = "noodles" version = "0.78.0" @@ -933,6 +632,7 @@ dependencies = [ "noodles-bam", "noodles-bcf", "noodles-bgzf", + "noodles-core", "noodles-cram", "noodles-csi", "noodles-fasta", @@ -949,7 +649,7 @@ version = "0.65.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "406d4768f21c73e3075c0c0d77a5b21bc8b8169c8f0963122607cc410427b727" dependencies = [ - "bit-vec 0.7.0", + "bit-vec", "bstr", "byteorder", "bytes", @@ -1034,7 +734,7 @@ version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4bc8001c54f1d8e47e1ac6041a5f27edc99b68bacea3fade9c89059de285aea" dependencies = [ - "bit-vec 0.7.0", + "bit-vec", "byteorder", "indexmap", "noodles-bgzf", @@ -1106,7 +806,7 @@ version = "0.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "545e16e229b7f8734b0a2a36bd4c98a5b70128663b16b5201ddadc0d09c28d4a" dependencies = [ - "bit-vec 0.7.0", + "bit-vec", "byteorder", "indexmap", "noodles-bgzf", @@ -1133,44 +833,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "num-complex" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-integer", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" -dependencies = [ - "autocfg", - "libm", -] - [[package]] name = "object" version = "0.36.2" @@ -1186,21 +848,6 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" -[[package]] -name = "ordered-float" -version = "4.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a91171844676f8c7990ce64959210cd2eaef32c2612c50f9fae9f8aaa6065a6" -dependencies = [ - "num-traits", -] - -[[package]] -name = "paste" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" - [[package]] name = "percent-encoding" version = "2.3.1" @@ -1235,15 +882,6 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" -[[package]] -name = "ppv-lite86" -version = "0.2.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" -dependencies = [ - "zerocopy", -] - [[package]] name = "proc-macro2" version = "1.0.86" @@ -1262,81 +900,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rand_distr" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" -dependencies = [ - "num-traits", - "rand", -] - -[[package]] -name = "rawpointer" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" - -[[package]] -name = "regex" -version = "1.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" - [[package]] name = "rusqlite" version = "0.31.0" @@ -1368,42 +931,6 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" -[[package]] -name = "rustc_version" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" -dependencies = [ - "semver", -] - -[[package]] -name = "rustversion" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" - -[[package]] -name = "ryu" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" - -[[package]] -name = "safe_arch" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3460605018fdc9612bce72735cba0d27efbcd9904780d44c7e3a9948f96148a" -dependencies = [ - "bytemuck", -] - -[[package]] -name = "semver" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" - [[package]] name = "serde" version = "1.0.204" @@ -1435,19 +962,6 @@ dependencies = [ "digest", ] -[[package]] -name = "simba" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061507c94fc6ab4ba1c9a0305018408e312e17c041eb63bef8aa726fa33aceae" -dependencies = [ - "approx", - "num-complex", - "num-traits", - "paste", - "wide", -] - [[package]] name = "slab" version = "0.4.9" @@ -1469,43 +983,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "statrs" -version = "0.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f697a07e4606a0a25c044de247e583a330dbb1731d11bc7350b81f48ad567255" -dependencies = [ - "approx", - "nalgebra", - "num-traits", - "rand", -] - [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "strum" -version = "0.26.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" - -[[package]] -name = "strum_macros" -version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "rustversion", - "syn", -] - [[package]] name = "syn" version = "2.0.72" @@ -1517,26 +1000,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "thiserror" -version = "1.0.63" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.63" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "tokio" version = "1.39.2" @@ -1561,12 +1024,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "triple_accel" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63" - [[package]] name = "typenum" version = "1.17.0" @@ -1591,37 +1048,12 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" -dependencies = [ - "serde", -] - [[package]] name = "version_check" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - -[[package]] -name = "wide" -version = "0.7.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "901e8597c777fa042e9e245bd56c0dc4418c5db3f845b6ff94fbac732c6a0692" -dependencies = [ - "bytemuck", - "safe_arch", -] - [[package]] name = "windows-sys" version = "0.52.0" @@ -1710,7 +1142,6 @@ version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ - "byteorder", "zerocopy-derive", ] diff --git a/Cargo.toml b/Cargo.toml index 1d08dce..c560b9f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] -bio = "2.0.0" clap = { version = "4.5.8", features = ["derive"] } include_dir = "0.7.4" intervaltree = "0.2.7" @@ -12,5 +11,5 @@ itertools = "0.13.0" rusqlite = { version = "0.31.0", features = ["bundled", "array"] } rusqlite_migration = { version = "1.2.0" , features = ["from-directory"]} sha2 = "0.10.8" -noodles = { version = "0.78.0", features = ["vcf", "fasta", "async"] } +noodles = { version = "0.78.0", features = ["core", "vcf", "fasta", "async"] } petgraph = "0.6.5" diff --git a/src/main.rs b/src/main.rs index a71bac6..1d02dcc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,8 +3,8 @@ use clap::{Parser, Subcommand}; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::path::PathBuf; +use std::{io, str}; -use bio::io::fasta; use gen::migrations::run_migrations; use gen::models::{ self, @@ -18,6 +18,7 @@ use gen::models::{ BlockGroup, }; use gen::{get_connection, parse_genotype}; +use noodles::fasta; use noodles::vcf; use noodles::vcf::variant::record::samples::series::value::genotype::Phasing; use noodles::vcf::variant::record::samples::series::Value; @@ -25,7 +26,6 @@ use noodles::vcf::variant::record::samples::{Sample, Series}; use noodles::vcf::variant::record::{AlternateBases, ReferenceBases, Samples}; use noodles::vcf::variant::Record; use rusqlite::{types::Value as SQLValue, Connection}; -use std::io; #[derive(Parser)] #[command(version, about, long_about = None)] @@ -76,27 +76,24 @@ enum Commands { fn import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connection) { // TODO: support gz - let mut reader = fasta::Reader::from_file(fasta).unwrap(); + let mut reader = fasta::io::reader::Builder.build_from_path(fasta).unwrap(); if !models::Collection::exists(conn, name) { let collection = models::Collection::create(conn, name); for result in reader.records() { let record = result.expect("Error during fasta record parsing"); - let sequence = String::from_utf8(record.seq().to_vec()).unwrap(); + let sequence = str::from_utf8(record.sequence().as_ref()) + .unwrap() + .to_string(); + let name = String::from_utf8(record.name().to_vec()).unwrap(); + let sequence_length = record.sequence().len() as i32; let seq_hash = Sequence::create(conn, "DNA", &sequence, !shallow); - let block_group = BlockGroup::create(conn, &collection.name, None, record.id()); - let block = Block::create( - conn, - &seq_hash, - block_group.id, - 0, - (sequence.len() as i32), - "+", - ); + let block_group = BlockGroup::create(conn, &collection.name, None, &name); + let block = Block::create(conn, &seq_hash, block_group.id, 0, sequence_length, "+"); Edge::create(conn, None, Some(block.id), 0, 0); Edge::create(conn, Some(block.id), None, 0, 0); - Path::create(conn, record.id(), block_group.id, vec![block.id]); + Path::create(conn, &name, block_group.id, vec![block.id]); } println!("Created it"); } else { @@ -106,16 +103,20 @@ fn import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connection fn new_import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connection) { // TODO: support gz - let mut reader = fasta::Reader::from_file(fasta).unwrap(); + let mut reader = fasta::io::reader::Builder.build_from_path(fasta).unwrap(); if !models::Collection::exists(conn, name) { let collection = models::Collection::create(conn, name); for result in reader.records() { let record = result.expect("Error during fasta record parsing"); - let sequence = String::from_utf8(record.seq().to_vec()).unwrap(); + let sequence = str::from_utf8(record.sequence().as_ref()) + .unwrap() + .to_string(); + let name = String::from_utf8(record.name().to_vec()).unwrap(); + let sequence_length = record.sequence().len() as i32; let seq_hash = Sequence::create(conn, "DNA", &sequence, !shallow); - let block_group = BlockGroup::create(conn, &collection.name, None, record.id()); + let block_group = BlockGroup::create(conn, &collection.name, None, &name); let edge_into = NewEdge::create( conn, NewEdge::PATH_START_HASH.to_string(), @@ -128,7 +129,7 @@ fn new_import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connec let edge_out_of = NewEdge::create( conn, seq_hash.to_string(), - sequence.len() as i32, + sequence_length, NewEdge::PATH_END_HASH.to_string(), 0, 0, @@ -137,7 +138,7 @@ fn new_import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connec BlockGroupEdge::bulk_create(conn, block_group.id, vec![edge_into.id, edge_out_of.id]); Path::new_create( conn, - record.id(), + &name, block_group.id, vec![edge_into.id, edge_out_of.id], ); From a0c591607a7f0b1a14e7fab961576cc929632b58 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 26 Aug 2024 10:51:50 -0400 Subject: [PATCH 17/18] Add strand info to new edges --- migrations/01-initial/up.sql | 4 +- src/main.rs | 4 ++ src/models.rs | 50 +++++++++++++++++++++++-- src/models/new_edge.rs | 71 ++++++++++++++++++++++++++++-------- src/models/path.rs | 29 +++++++++++---- src/models/path_edge.rs | 38 ++++++++++++++----- 6 files changed, 160 insertions(+), 36 deletions(-) diff --git a/migrations/01-initial/up.sql b/migrations/01-initial/up.sql index 90948e8..a760603 100644 --- a/migrations/01-initial/up.sql +++ b/migrations/01-initial/up.sql @@ -85,15 +85,17 @@ CREATE TABLE new_edges ( id INTEGER PRIMARY KEY NOT NULL, source_hash TEXT NOT NULL, source_coordinate INTEGER NOT NULL, + source_strand TEXT NOT NULL, target_hash TEXT NOT NULL, target_coordinate INTEGER NOT NULL, + target_strand TEXT NOT NULL, chromosome_index INTEGER NOT NULL, phased INTEGER NOT NULL, FOREIGN KEY(source_hash) REFERENCES sequence(hash), FOREIGN KEY(target_hash) REFERENCES sequence(hash), constraint chk_phased check (phased in (0, 1)) ); -CREATE UNIQUE INDEX new_edge_uidx ON new_edges(source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased); +CREATE UNIQUE INDEX new_edge_uidx ON new_edges(source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased); CREATE TABLE path_edges ( id INTEGER PRIMARY KEY NOT NULL, diff --git a/src/main.rs b/src/main.rs index 35b6d62..1f7891e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -120,8 +120,10 @@ fn new_import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connec conn, NewEdge::PATH_START_HASH.to_string(), 0, + "+".to_string(), seq_hash.to_string(), 0, + "+".to_string(), 0, 0, ); @@ -129,8 +131,10 @@ fn new_import_fasta(fasta: &String, name: &str, shallow: bool, conn: &mut Connec conn, seq_hash.to_string(), sequence.len() as i32, + "+".to_string(), NewEdge::PATH_END_HASH.to_string(), 0, + "+".to_string(), 0, 0, ); diff --git a/src/models.rs b/src/models.rs index 0872b38..fc7ea2f 100644 --- a/src/models.rs +++ b/src/models.rs @@ -756,8 +756,10 @@ impl BlockGroup { let new_edge = EdgeData { source_hash: start_block.sequence.hash.clone(), source_coordinate: start - start_block.path_start + start_block.sequence_start, + source_strand: "+".to_string(), target_hash: end_block.sequence.hash.clone(), target_coordinate: end - end_block.path_start + end_block.sequence_start, + target_strand: "+".to_string(), chromosome_index, phased, }; @@ -767,16 +769,20 @@ impl BlockGroup { let new_start_edge = EdgeData { source_hash: start_block.sequence.hash.clone(), source_coordinate: start - start_block.path_start + start_block.sequence_start, + source_strand: "+".to_string(), target_hash: new_block.sequence.hash.clone(), target_coordinate: new_block.sequence_start, + target_strand: "+".to_string(), chromosome_index, phased, }; let new_end_edge = EdgeData { source_hash: new_block.sequence.hash.clone(), source_coordinate: new_block.sequence_end, + source_strand: "+".to_string(), target_hash: end_block.sequence.hash.clone(), target_coordinate: end - end_block.path_start + end_block.sequence_start, + target_strand: "+".to_string(), chromosome_index, phased, }; @@ -791,8 +797,10 @@ impl BlockGroup { let new_split_start_edge = EdgeData { source_hash: start_block.sequence.hash.clone(), source_coordinate: split_coordinate, + source_strand: "+".to_string(), target_hash: start_block.sequence.hash.clone(), target_coordinate: split_coordinate, + target_strand: "+".to_string(), chromosome_index, phased, }; @@ -804,8 +812,10 @@ impl BlockGroup { let new_split_end_edge = EdgeData { source_hash: end_block.sequence.hash.clone(), source_coordinate: split_coordinate, + source_strand: "+".to_string(), target_hash: end_block.sequence.hash.clone(), target_coordinate: split_coordinate, + target_strand: "+".to_string(), chromosome_index, phased, }; @@ -1164,20 +1174,54 @@ mod tests { conn, NewEdge::PATH_START_HASH.to_string(), 0, + "+".to_string(), a_seq_hash.clone(), 0, + "+".to_string(), + 0, + 0, + ); + let edge1 = NewEdge::create( + conn, + a_seq_hash, + 10, + "+".to_string(), + t_seq_hash.clone(), + 0, + "+".to_string(), + 0, + 0, + ); + let edge2 = NewEdge::create( + conn, + t_seq_hash, + 10, + "+".to_string(), + c_seq_hash.clone(), + 0, + "+".to_string(), + 0, + 0, + ); + let edge3 = NewEdge::create( + conn, + c_seq_hash, + 10, + "+".to_string(), + g_seq_hash.clone(), + 0, + "+".to_string(), 0, 0, ); - let edge1 = NewEdge::create(conn, a_seq_hash, 10, t_seq_hash.clone(), 0, 0, 0); - let edge2 = NewEdge::create(conn, t_seq_hash, 10, c_seq_hash.clone(), 0, 0, 0); - let edge3 = NewEdge::create(conn, c_seq_hash, 10, g_seq_hash.clone(), 0, 0, 0); let edge4 = NewEdge::create( conn, g_seq_hash, 10, + "+".to_string(), NewEdge::PATH_END_HASH.to_string(), 0, + "+".to_string(), 0, 0, ); diff --git a/src/models/new_edge.rs b/src/models/new_edge.rs index e3c56ff..4c0736f 100644 --- a/src/models/new_edge.rs +++ b/src/models/new_edge.rs @@ -1,5 +1,5 @@ use rusqlite::types::Value; -use rusqlite::{params_from_iter, Connection}; +use rusqlite::{params_from_iter, types::Value as SQLValue, Connection}; use std::collections::{HashMap, HashSet}; use std::hash::RandomState; @@ -8,8 +8,10 @@ pub struct NewEdge { pub id: i32, pub source_hash: String, pub source_coordinate: i32, + pub source_strand: String, pub target_hash: String, pub target_coordinate: i32, + pub target_strand: String, pub chromosome_index: i32, pub phased: i32, } @@ -18,8 +20,10 @@ pub struct NewEdge { pub struct EdgeData { pub source_hash: String, pub source_coordinate: i32, + pub source_strand: String, pub target_hash: String, pub target_coordinate: i32, + pub target_strand: String, pub chromosome_index: i32, pub phased: i32, } @@ -30,22 +34,27 @@ impl NewEdge { pub const PATH_END_HASH: &'static str = "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"; + #[allow(clippy::too_many_arguments)] pub fn create( conn: &Connection, source_hash: String, source_coordinate: i32, + source_strand: String, target_hash: String, target_coordinate: i32, + target_strand: String, chromosome_index: i32, phased: i32, ) -> NewEdge { - let query = "INSERT INTO new_edges (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6) RETURNING *"; - let id_query = "select id from new_edges where source_hash = ?1 and source_coordinate = ?2 and target_hash = ?3 and target_coordinate = ?4 and chromosome_index = ?5 and phased = ?6"; + let query = "INSERT INTO new_edges (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8) RETURNING *"; + let id_query = "select id from new_edges where source_hash = ?1 and source_coordinate = ?2 and source_strand = ?3 and target_hash = ?4 and target_coordinate = ?5 and target_strand = ?6 and chromosome_index = ?7 and phased = ?8"; let placeholders: Vec = vec![ source_hash.clone().into(), source_coordinate.into(), + source_strand.clone().into(), target_hash.clone().into(), target_coordinate.into(), + target_strand.clone().into(), chromosome_index.into(), phased.into(), ]; @@ -56,10 +65,12 @@ impl NewEdge { id: row.get(0)?, source_hash: row.get(1)?, source_coordinate: row.get(2)?, - target_hash: row.get(3)?, - target_coordinate: row.get(4)?, - chromosome_index: row.get(5)?, - phased: row.get(6)?, + source_strand: row.get(3)?, + target_hash: row.get(4)?, + target_coordinate: row.get(5)?, + target_strand: row.get(6)?, + chromosome_index: row.get(7)?, + phased: row.get(8)?, }) }) { Ok(edge) => edge, @@ -72,8 +83,10 @@ impl NewEdge { .unwrap(), source_hash, source_coordinate, + source_strand, target_hash, target_coordinate, + target_strand, chromosome_index, phased, } @@ -93,7 +106,7 @@ impl NewEdge { .map(|edge_id| edge_id.to_string()) .collect::>() .join(","); - let query = format!("select id, source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased from new_edges where id in ({});", formatted_edge_ids); + let query = format!("select id, source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased from new_edges where id in ({});", formatted_edge_ids); NewEdge::query(conn, &query, vec![]) } @@ -105,10 +118,12 @@ impl NewEdge { id: row.get(0)?, source_hash: row.get(1)?, source_coordinate: row.get(2)?, - target_hash: row.get(3)?, - target_coordinate: row.get(4)?, - chromosome_index: row.get(5)?, - phased: row.get(6)?, + source_strand: row.get(3)?, + target_hash: row.get(4)?, + target_coordinate: row.get(5)?, + target_strand: row.get(6)?, + chromosome_index: row.get(7)?, + phased: row.get(8)?, }) }) .unwrap(); @@ -123,13 +138,17 @@ impl NewEdge { let mut edge_rows = vec![]; for edge in &edges { let source_hash = format!("\"{0}\"", edge.source_hash); + let source_strand = format!("\"{0}\"", edge.source_strand); let target_hash = format!("\"{0}\"", edge.target_hash); + let target_strand = format!("\"{0}\"", edge.target_strand); let edge_row = format!( - "({0}, {1}, {2}, {3}, {4}, {5})", + "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})", source_hash, edge.source_coordinate, + source_strand, target_hash, edge.target_coordinate, + target_strand, edge.chromosome_index, edge.phased ); @@ -137,7 +156,7 @@ impl NewEdge { } let formatted_edge_rows = edge_rows.join(", "); - let select_statement = format!("SELECT * FROM new_edges WHERE (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) in ({0});", formatted_edge_rows); + let select_statement = format!("SELECT * FROM new_edges WHERE (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) in ({0});", formatted_edge_rows); let existing_edges = NewEdge::query(conn, &select_statement, vec![]); let mut existing_edge_ids: Vec = existing_edges .clone() @@ -159,12 +178,16 @@ impl NewEdge { for edge in edges_to_insert { let source_hash = format!("\"{0}\"", edge.source_hash); let target_hash = format!("\"{0}\"", edge.target_hash); + let source_strand = format!("\"{0}\"", edge.source_strand); + let target_strand = format!("\"{0}\"", edge.target_strand); let edge_row = format!( - "({0}, {1}, {2}, {3}, {4}, {5})", + "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})", source_hash, edge.source_coordinate, + source_strand, target_hash, edge.target_coordinate, + target_strand, edge.chromosome_index, edge.phased ); @@ -177,7 +200,7 @@ impl NewEdge { let formatted_edge_rows_to_insert = edge_rows_to_insert.join(", "); - let insert_statement = format!("INSERT INTO new_edges (source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased) VALUES {0} RETURNING (id);", formatted_edge_rows_to_insert); + let insert_statement = format!("INSERT INTO new_edges (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) VALUES {0} RETURNING (id);", formatted_edge_rows_to_insert); let mut stmt = conn.prepare(&insert_statement).unwrap(); let rows = stmt.query_map([], |row| row.get(0)).unwrap(); let mut edge_ids: Vec = vec![]; @@ -193,8 +216,10 @@ impl NewEdge { EdgeData { source_hash: edge.source_hash, source_coordinate: edge.source_coordinate, + source_strand: edge.source_strand, target_hash: edge.target_hash, target_coordinate: edge.target_coordinate, + target_strand: edge.target_strand, chromosome_index: edge.chromosome_index, phased: edge.phased, } @@ -225,8 +250,10 @@ mod tests { let edge1 = EdgeData { source_hash: NewEdge::PATH_START_HASH.to_string(), source_coordinate: -1, + source_strand: "+".to_string(), target_hash: sequence1_hash.clone(), target_coordinate: 1, + target_strand: "+".to_string(), chromosome_index: 0, phased: 0, }; @@ -234,16 +261,20 @@ mod tests { let edge2 = EdgeData { source_hash: sequence1_hash.clone(), source_coordinate: 2, + source_strand: "+".to_string(), target_hash: sequence2_hash.clone(), target_coordinate: 3, + target_strand: "+".to_string(), chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { source_hash: sequence2_hash.clone(), source_coordinate: 4, + source_strand: "+".to_string(), target_hash: NewEdge::PATH_END_HASH.to_string(), target_coordinate: -1, + target_strand: "+".to_string(), chromosome_index: 0, phased: 0, }; @@ -282,8 +313,10 @@ mod tests { conn, NewEdge::PATH_START_HASH.to_string(), -1, + "+".to_string(), sequence1_hash.clone(), 1, + "+".to_string(), 0, 0, ); @@ -295,8 +328,10 @@ mod tests { let edge1 = EdgeData { source_hash: NewEdge::PATH_START_HASH.to_string(), source_coordinate: -1, + source_strand: "+".to_string(), target_hash: sequence1_hash.clone(), target_coordinate: 1, + target_strand: "+".to_string(), chromosome_index: 0, phased: 0, }; @@ -304,16 +339,20 @@ mod tests { let edge2 = EdgeData { source_hash: sequence1_hash.clone(), source_coordinate: 2, + source_strand: "+".to_string(), target_hash: sequence2_hash.clone(), target_coordinate: 3, + target_strand: "+".to_string(), chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { source_hash: sequence2_hash.clone(), source_coordinate: 4, + source_strand: "+".to_string(), target_hash: NewEdge::PATH_END_HASH.to_string(), target_coordinate: -1, + target_strand: "+".to_string(), chromosome_index: 0, phased: 0, }; diff --git a/src/models/path.rs b/src/models/path.rs index 518dfa4..b1ac76d 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -195,19 +195,24 @@ impl Path { let end = out_of.source_coordinate; let strand; - let block_sequence; let block_sequence_length; - if end >= start { - strand = "+"; - block_sequence = sequence.sequence[start as usize..end as usize].to_string(); + if into.target_strand == out_of.source_strand { + strand = into.target_strand; block_sequence_length = end - start; } else { - strand = "-"; - block_sequence = revcomp(&sequence.sequence[end as usize..start as usize + 1]); - block_sequence_length = start - end; + panic!( + "Edge pair with target_strand/source_strand mismatch for path {}", + path.id + ); } + let block_sequence = if strand == "-" { + revcomp(&sequence.sequence[start as usize..end as usize]) + } else { + sequence.sequence[start as usize..end as usize].to_string() + }; + NewBlock { id: block_id, sequence: sequence.clone(), @@ -489,8 +494,10 @@ mod tests { conn, NewEdge::PATH_START_HASH.to_string(), -1, + "+".to_string(), sequence1_hash.clone(), 0, + "+".to_string(), 0, 0, ); @@ -499,8 +506,10 @@ mod tests { conn, sequence1_hash.clone(), 8, + "+".to_string(), sequence2_hash.clone(), 1, + "+".to_string(), 0, 0, ); @@ -509,8 +518,10 @@ mod tests { conn, sequence2_hash.clone(), 8, + "+".to_string(), sequence3_hash.clone(), 1, + "+".to_string(), 0, 0, ); @@ -519,8 +530,10 @@ mod tests { conn, sequence3_hash.clone(), 8, + "+".to_string(), sequence4_hash.clone(), 1, + "+".to_string(), 0, 0, ); @@ -528,8 +541,10 @@ mod tests { conn, sequence4_hash.clone(), 8, + "+".to_string(), NewEdge::PATH_END_HASH.to_string(), -1, + "+".to_string(), 0, 0, ); diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs index d6ede1d..f7205fe 100644 --- a/src/models/path_edge.rs +++ b/src/models/path_edge.rs @@ -117,8 +117,10 @@ mod tests { conn, NewEdge::PATH_START_HASH.to_string(), -123, + "+".to_string(), sequence1_hash.clone(), 0, + "+".to_string(), 0, 0, ); @@ -127,8 +129,10 @@ mod tests { conn, sequence1_hash.clone(), 8, + "+".to_string(), sequence2_hash.clone(), 1, + "+".to_string(), 0, 0, ); @@ -137,8 +141,10 @@ mod tests { conn, sequence2_hash.clone(), 8, + "+".to_string(), sequence3_hash.clone(), 1, + "+".to_string(), 0, 0, ); @@ -147,8 +153,10 @@ mod tests { conn, sequence3_hash.clone(), 8, + "+".to_string(), sequence4_hash.clone(), 1, + "+".to_string(), 0, 0, ); @@ -156,8 +164,10 @@ mod tests { conn, sequence4_hash.clone(), 8, + "+".to_string(), NewEdge::PATH_END_HASH.to_string(), -1, + "+".to_string(), 0, 0, ); @@ -183,9 +193,11 @@ mod tests { let edge5 = NewEdge::create( conn, sequence1_hash.clone(), - 0, + 8, + "-".to_string(), NewEdge::PATH_END_HASH.to_string(), - -1, + 0, + "-".to_string(), 0, 0, ); @@ -193,9 +205,11 @@ mod tests { let edge4 = NewEdge::create( conn, sequence2_hash.clone(), - 1, - sequence1_hash.clone(), 7, + "-".to_string(), + sequence1_hash.clone(), + 0, + "-".to_string(), 0, 0, ); @@ -203,9 +217,11 @@ mod tests { let edge3 = NewEdge::create( conn, sequence3_hash.clone(), - 1, - sequence2_hash.clone(), 7, + "-".to_string(), + sequence2_hash.clone(), + 0, + "-".to_string(), 0, 0, ); @@ -213,9 +229,11 @@ mod tests { let edge2 = NewEdge::create( conn, sequence4_hash.clone(), - 1, - sequence3_hash.clone(), 7, + "-".to_string(), + sequence3_hash.clone(), + 0, + "-".to_string(), 0, 0, ); @@ -223,8 +241,10 @@ mod tests { conn, NewEdge::PATH_START_HASH.to_string(), -1, + "-".to_string(), sequence4_hash.clone(), - 7, + 0, + "-".to_string(), 0, 0, ); From 3ba3754858f4083753af88411b5d41190d4cb2a7 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 26 Aug 2024 10:56:14 -0400 Subject: [PATCH 18/18] Clearer constant names --- migrations/01-initial/up.sql | 2 +- src/models/new_edge.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/migrations/01-initial/up.sql b/migrations/01-initial/up.sql index a760603..13a6c70 100644 --- a/migrations/01-initial/up.sql +++ b/migrations/01-initial/up.sql @@ -116,4 +116,4 @@ CREATE TABLE block_group_edges ( ); CREATE UNIQUE INDEX block_group_edges_uidx ON block_group_edges(block_group_id, edge_id); -INSERT INTO sequence (hash, sequence_type, sequence, "length") values ("yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", 64), ("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", 64); +INSERT INTO sequence (hash, sequence_type, sequence, "length") values ("start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", 64), ("end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", 64); diff --git a/src/models/new_edge.rs b/src/models/new_edge.rs index 4c0736f..4ee3c8a 100644 --- a/src/models/new_edge.rs +++ b/src/models/new_edge.rs @@ -30,9 +30,9 @@ pub struct EdgeData { impl NewEdge { pub const PATH_START_HASH: &'static str = - "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; + "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; pub const PATH_END_HASH: &'static str = - "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"; + "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"; #[allow(clippy::too_many_arguments)] pub fn create(