diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index 7e5b6be..0d2a0dd 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -85,7 +85,7 @@ CREATE TABLE path_edges ( FOREIGN KEY(edge_id) REFERENCES edges(id), FOREIGN KEY(path_id) REFERENCES path(id) ) STRICT; -CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id); +CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id, index_in_path); CREATE TABLE block_group_edges ( id INTEGER PRIMARY KEY NOT NULL, diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index e0bf431..44dde5d 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -1,4 +1,5 @@ use itertools::Itertools; +use petgraph::prelude::DiGraphMap; use rusqlite::Connection; use std::collections::{HashMap, HashSet}; use std::fs; @@ -13,6 +14,7 @@ use crate::models::{ collection::Collection, edge::{Edge, GroupBlock}, path::Path, + path_edge::PathEdge, sequence::Sequence, strand::Strand, }; @@ -32,7 +34,7 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) let (graph, edges_by_node_pair) = Edge::build_graph(conn, &edges, &blocks); - let mut file = File::create(filename).unwrap(); + let file = File::create(filename).unwrap(); let mut writer = BufWriter::new(file); let mut terminal_block_ids = HashSet::new(); @@ -43,6 +45,33 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) terminal_block_ids.insert(block.id); continue; } + } + + write_segments(&mut writer, blocks.clone(), terminal_block_ids.clone()); + write_links( + &mut writer, + graph, + edges_by_node_pair.clone(), + terminal_block_ids, + ); + write_paths( + &mut writer, + conn, + collection_name, + edges_by_node_pair, + &blocks, + ); +} + +fn write_segments( + writer: &mut BufWriter, + blocks: Vec, + terminal_block_ids: HashSet, +) { + for block in &blocks { + if terminal_block_ids.contains(&block.id) { + continue; + } writer 
.write_all(&segment_line(&block.sequence, block.id as usize).into_bytes()) .unwrap_or_else(|_| { @@ -52,13 +81,18 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) ) }); } +} - let blocks_by_id = blocks - .clone() - .into_iter() - .map(|block| (block.id, block)) - .collect::<HashMap<i32, GroupBlock>>(); +fn segment_line(sequence: &str, index: usize) -> String { + format!("S\t{}\t{}\t{}\n", index + 1, sequence, "*") +} +fn write_links( + writer: &mut BufWriter<File>, + graph: DiGraphMap<i32, ()>, + edges_by_node_pair: HashMap<(i32, i32), Edge>, + terminal_block_ids: HashSet<i32>, +) { for (source, target, ()) in graph.all_edges() { if terminal_block_ids.contains(&source) || terminal_block_ids.contains(&target) { continue; @@ -77,10 +111,6 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) } } -fn segment_line(sequence: &str, index: usize) -> String { - format!("S\t{}\t{}\t{}\n", index, sequence, "*") -} - fn link_line( source_index: i32, source_strand: Strand, @@ -89,10 +119,100 @@ fn link_line( ) -> String { format!( "L\t{}\t{}\t{}\t{}\t*\n", - source_index, source_strand, target_index, target_strand + source_index + 1, + source_strand, + target_index + 1, + target_strand ) } +// NOTE: A path is an immutable list of edges, but the sequence between the target of one edge and +// the source of the next may be "split" by later operations that add edges with sources or targets +// on a sequence that are in between those of a consecutive pair of edges in a path. This function +// handles that case by collecting all the nodes between the target of one edge and the source of +// the next. 
+fn nodes_for_edges( + edge1: &Edge, + edge2: &Edge, + blocks_by_hash_and_start: &HashMap<(&str, i32), GroupBlock>, + blocks_by_hash_and_end: &HashMap<(&str, i32), GroupBlock>, +) -> Vec<i32> { + let mut current_block = blocks_by_hash_and_start + .get(&(edge1.target_hash.as_str(), edge1.target_coordinate)) + .unwrap(); + let end_block = blocks_by_hash_and_end + .get(&(edge2.source_hash.as_str(), edge2.source_coordinate)) + .unwrap(); + let mut node_ids = vec![]; + // Walk forward block by block (each block starts where the previous one ends) until we + // reach the end block; reassigning (not shadowing) current_block ensures the loop advances. + while current_block.id != end_block.id { + node_ids.push(current_block.id); + current_block = blocks_by_hash_and_start + .get(&(current_block.sequence_hash.as_str(), current_block.end)) + .unwrap(); + } + node_ids.push(end_block.id); + + node_ids +} + +fn write_paths( + writer: &mut BufWriter<File>, + conn: &Connection, + collection_name: &str, + edges_by_node_pair: HashMap<(i32, i32), Edge>, + blocks: &[GroupBlock], +) { + let paths = Path::get_paths_for_collection(conn, collection_name); + let edges_by_path_id = + PathEdge::edges_for_paths(conn, paths.iter().map(|path| path.id).collect()); + let node_pairs_by_edge_id = edges_by_node_pair + .iter() + .map(|(node_pair, edge)| (edge.id, *node_pair)) + .collect::<HashMap<i32, (i32, i32)>>(); + + let blocks_by_hash_and_start = blocks + .iter() + .map(|block| ((block.sequence_hash.as_str(), block.start), block.clone())) + .collect::<HashMap<(&str, i32), GroupBlock>>(); + let blocks_by_hash_and_end = blocks + .iter() + .map(|block| ((block.sequence_hash.as_str(), block.end), block.clone())) + .collect::<HashMap<(&str, i32), GroupBlock>>(); + + for path in paths { + let edges_for_path = edges_by_path_id.get(&path.id).unwrap(); + let mut node_ids = vec![]; + let mut node_strands = vec![]; + for (edge1, edge2) in edges_for_path.iter().tuple_windows() { + let current_node_ids = nodes_for_edges( + edge1, + edge2, + &blocks_by_hash_and_start, + &blocks_by_hash_and_end, + ); + for node_id in &current_node_ids { + node_ids.push(*node_id); + node_strands.push(edge1.target_strand); + } + } + + writer 
.write_all(&path_line(&path.name, &node_ids, &node_strands).into_bytes()) + .unwrap_or_else(|_| panic!("Error writing path {} to GFA stream", path.name)); + } +} + +fn path_line(path_name: &str, node_ids: &[i32], node_strands: &[Strand]) -> String { + let nodes = node_ids + .iter() + .zip(node_strands.iter()) + .map(|(node_id, node_strand)| format!("{}{}", *node_id + 1, node_strand)) + .collect::>() + .join(","); + format!("P\t{}\t{}\n", path_name, nodes) +} + mod tests { use rusqlite::Connection; // Note this useful idiom: importing names from outer (for mod tests) scope. @@ -103,7 +223,7 @@ mod tests { block_group::BlockGroup, block_group_edge::BlockGroupEdge, collection::Collection, edge::Edge, sequence::Sequence, }; - use crate::test_helpers::get_connection; + use crate::test_helpers::{get_connection, setup_gen_dir}; use tempfile::tempdir; #[test] @@ -112,7 +232,7 @@ mod tests { let conn = get_connection(None); let collection_name = "test collection"; - let collection = Collection::create(&conn, collection_name); + Collection::create(&conn, collection_name); let block_group = BlockGroup::create(&conn, collection_name, None, "test block group"); let sequence1 = Sequence::new() .sequence_type("DNA") @@ -192,6 +312,14 @@ mod tests { block_group.id, &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); + + Path::create( + &conn, + "1234", + block_group.id, + &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], + ); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group.id); let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); @@ -208,5 +336,90 @@ mod tests { let all_sequences2 = BlockGroup::get_all_sequences(&conn, block_group2.id); assert_eq!(all_sequences, all_sequences2); + + let paths = Path::get_paths_for_collection(&conn, "test collection 2"); + assert_eq!(paths.len(), 1); + assert_eq!(Path::sequence(&conn, paths[0].clone()), "AAAATTTTGGGGCCCC"); + } + + #[test] + fn test_simple_round_trip() { + setup_gen_dir(); + let 
mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + gfa_path.push("fixtures/simple.gfa"); + let collection_name = "test".to_string(); + let conn = &get_connection(None); + import_gfa(&gfa_path, &collection_name, conn); + + let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + + let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); + let mut gfa_path = PathBuf::from(temp_dir.path()); + gfa_path.push("intermediate.gfa"); + + export_gfa(conn, &collection_name, &gfa_path); + import_gfa(&gfa_path, "test collection 2", conn); + + let block_group2 = Collection::get_block_groups(conn, "test collection 2") + .pop() + .unwrap(); + let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); + + assert_eq!(all_sequences, all_sequences2); + } + + #[test] + fn test_anderson_round_trip() { + setup_gen_dir(); + let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + gfa_path.push("fixtures/anderson_promoters.gfa"); + let collection_name = "anderson promoters".to_string(); + let conn = &get_connection(None); + import_gfa(&gfa_path, &collection_name, conn); + + let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + + let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); + let mut gfa_path = PathBuf::from(temp_dir.path()); + gfa_path.push("intermediate.gfa"); + + export_gfa(conn, &collection_name, &gfa_path); + import_gfa(&gfa_path, "anderson promoters 2", conn); + + let block_group2 = Collection::get_block_groups(conn, "anderson promoters 2") + .pop() + .unwrap(); + let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); + + assert_eq!(all_sequences, all_sequences2); + } + + #[test] + fn test_reverse_strand_round_trip() { + setup_gen_dir(); + let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); 
+ gfa_path.push("fixtures/reverse_strand.gfa"); + let collection_name = "test".to_string(); + let conn = &get_connection(None); + import_gfa(&gfa_path, &collection_name, conn); + + let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + + let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); + let mut gfa_path = PathBuf::from(temp_dir.path()); + gfa_path.push("intermediate.gfa"); + + export_gfa(conn, &collection_name, &gfa_path); + import_gfa(&gfa_path, "test collection 2", conn); + + let block_group2 = Collection::get_block_groups(conn, "test collection 2") + .pop() + .unwrap(); + let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); + + assert_eq!(all_sequences, all_sequences2); } } diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index 82a84cf..e3dc8cd 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -302,4 +302,33 @@ mod tests { let result = Path::sequence(conn, path); assert_eq!(result, "TATGCCAGCTGCGAATA"); } + + #[test] + fn test_import_anderson_promoters() { + let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + gfa_path.push("fixtures/anderson_promoters.gfa"); + let collection_name = "anderson promoters".to_string(); + let conn = &get_connection(None); + import_gfa(&gfa_path, &collection_name, conn); + + let paths = Path::get_paths_for_collection(conn, &collection_name); + assert_eq!(paths.len(), 20); + + let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); + let path = Path::get_paths( + conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![ + SQLValue::from(block_group_id), + SQLValue::from("BBa_J23100".to_string()), + ], + )[0] + .clone(); + + let result = Path::sequence(conn, path); + let expected_sequence_parts = vec!["T", "T", "G", "A", "C", "G", "GCTAGCTCAG", "T", "CCT", "A", "GG", "T", "A", "C", "A", "G", + 
"TGCTAGCTACTAGTGAAAGAGGAGAAATACTAGATGGCTTCCTCCGAAGACGTTATCAAAGAGTTCATGCGTTTCAAAGTTCGTATGGAAGGTTCCGTTAACGGTCACGAGTTCGAAATCGAAGGTGAAGGTGAAGGTCGTCCGTACGAAGGTACCCAGACCGCTAAACTGAAAGTTACCAAAGGTGGTCCGCTGCCGTTCGCTTGGGACATCCTGTCCCCGCAGTTCCAGTACGGTTCCAAAGCTTACGTTAAACACCCGGCTGACATCCCGGACTACCTGAAACTGTCCTTCCCGGAAGGTTTCAAATGGGAACGTGTTATGAACTTCGAAGACGGTGGTGTTGTTACCGTTACCCAGGACTCCTCCCTGCAAGACGGTGAGTTCATCTACAAAGTTAAACTGCGTGGTACCAACTTCCCGTCCGACGGTCCGGTTATGCAGAAAAAAACCATGGGTTGGGAAGCTTCCACCGAACGTATGTACCCGGAAGACGGTGCTCTGAAAGGTGAAATCAAAATGCGTCTGAAACTGAAAGACGGTGGTCACTACGACGCTGAAGTTAAAACCACCTACATGGCTAAAAAACCGGTTCAGCTGCCGGGTGCTTACAAAACCGACATCAAACTGGACATCACCTCCCACAACGAAGACTACACCATCGTTGAACAGTACGAACGTGCTGAAGGTCGTCACTCCACCGGTGCTTAATAACGCTGATAGTGCTAGTGTAGATCGCTACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATATACTAGAAGCGGCCGCTGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATT
TATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCAGAATTTCAGATAAAAAAAATCCTTAGCTTTCGCTAAGGATGATTTCTGGAATTCGCGGCCGCATCTAGAG"]; + let expected_sequence = expected_sequence_parts.join(""); + assert_eq!(result, expected_sequence); + } } diff --git a/src/models/block_group.rs b/src/models/block_group.rs index ccd98eb..654c25f 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -160,7 +160,7 @@ impl BlockGroup { BlockGroupEdge::bulk_create(conn, target_block_group_id, &edge_ids); for path in existing_paths { - let edge_ids = PathEdge::edges_for(conn, path.id) + let edge_ids = PathEdge::edges_for_path(conn, path.id) .into_iter() .map(|edge| edge.id) .collect::>(); diff --git a/src/models/collection.rs b/src/models/collection.rs index e60185a..1b65856 100644 --- a/src/models/collection.rs +++ b/src/models/collection.rs @@ -2,6 +2,7 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; use crate::models::block_group::BlockGroup; +use crate::models::path::Path; #[derive(Clone, Debug)] pub struct Collection { @@ -15,6 +16,7 @@ impl Collection { .unwrap(); stmt.exists([name]).unwrap() } + pub fn create(conn: 
&Connection, name: &str) -> Collection { let mut stmt = conn .prepare("INSERT INTO collection (name) VALUES (?1) RETURNING *") diff --git a/src/models/path.rs b/src/models/path.rs index 5932ed6..5600f61 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -119,6 +119,11 @@ impl Path { paths } + pub fn get_paths_for_collection(conn: &Connection, collection_name: &str) -> Vec<Path> { + let query = "SELECT * FROM path JOIN block_group ON path.block_group_id = block_group.id WHERE block_group.collection_name = ?1"; + Path::get_paths(conn, query, vec![Value::from(collection_name.to_string())]) + } + pub fn sequence(conn: &Connection, path: Path) -> String { let blocks = Path::blocks_for(conn, &path); blocks @@ -179,7 +184,7 @@ impl Path { } pub fn blocks_for(conn: &Connection, path: &Path) -> Vec { - let edges = PathEdge::edges_for(conn, path.id); + let edges = PathEdge::edges_for_path(conn, path.id); let mut sequence_hashes = HashSet::new(); for edge in &edges { if edge.source_hash != Sequence::PATH_START_HASH { diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs index a7a67a5..750d9b0 100644 --- a/src/models/path_edge.rs +++ b/src/models/path_edge.rs @@ -1,8 +1,10 @@ -use crate::models::edge::Edge; +use itertools::Itertools; use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; use std::collections::HashMap; +use crate::models::edge::Edge; + #[derive(Clone, Debug)] pub struct PathEdge { pub id: i32, @@ -72,7 +74,7 @@ impl PathEdge { objs } - pub fn edges_for(conn: &Connection, path_id: i32) -> Vec<Edge> { + pub fn edges_for_path(conn: &Connection, path_id: i32) -> Vec<Edge> { let path_edges = PathEdge::query( conn, "select * from path_edges where path_id = ?1 order by index_in_path ASC", @@ -92,6 +94,48 @@ impl PathEdge { .map(|edge_id| edges_by_id[&edge_id].clone()) .collect::<Vec<Edge>>() } + + pub fn edges_for_paths(conn: &Connection, path_ids: Vec<i32>) -> HashMap<i32, Vec<Edge>> { + let placeholder_string = path_ids.iter().map(|_| "?").join(","); + let path_edges = 
PathEdge::query( + conn, + format!( + "select * from path_edges where path_id in ({}) ORDER BY path_id, index_in_path", + placeholder_string + ) + .as_str(), + path_ids + .into_iter() + .map(Value::from) + .collect::<Vec<Value>>(), + ); + let edge_ids = path_edges + .clone() + .into_iter() + .map(|path_edge| path_edge.edge_id) + .collect::<Vec<i32>>(); + let edges = Edge::bulk_load(conn, &edge_ids); + let edges_by_id = edges + .into_iter() + .map(|edge| (edge.id, edge)) + .collect::<HashMap<i32, Edge>>(); + let path_edges_by_path_id = path_edges + .into_iter() + .map(|path_edge| (path_edge.path_id, path_edge.edge_id)) + .into_group_map(); + path_edges_by_path_id + .into_iter() + .map(|(path_id, edge_ids)| { + ( + path_id, + edge_ids + .into_iter() + .map(|edge_id| edges_by_id[&edge_id].clone()) + .collect::<Vec<Edge>>(), + ) + }) + .collect::<HashMap<i32, Vec<Edge>>>() + } +} + +mod tests {}