From 30af6139027f593a73ef1666ecf60c2225df9d04 Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 17 Sep 2024 20:07:33 -0400 Subject: [PATCH 1/4] Export paths to GFA --- migrations/core/01-initial/up.sql | 2 +- src/exports/gfa.rs | 90 +++++++++++++++++++++++++++---- src/models/block_group.rs | 2 +- src/models/collection.rs | 2 + src/models/path.rs | 7 ++- src/models/path_edge.rs | 48 ++++++++++++++++- 6 files changed, 135 insertions(+), 16 deletions(-) diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index 7e5b6be..0d2a0dd 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -85,7 +85,7 @@ CREATE TABLE path_edges ( FOREIGN KEY(edge_id) REFERENCES edges(id), FOREIGN KEY(path_id) REFERENCES path(id) ) STRICT; -CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id); +CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id, index_in_path); CREATE TABLE block_group_edges ( id INTEGER PRIMARY KEY NOT NULL, diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index e0bf431..1c69543 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -1,4 +1,4 @@ -use itertools::Itertools; +use petgraph::prelude::DiGraphMap; use rusqlite::Connection; use std::collections::{HashMap, HashSet}; use std::fs; @@ -13,6 +13,7 @@ use crate::models::{ collection::Collection, edge::{Edge, GroupBlock}, path::Path, + path_edge::PathEdge, sequence::Sequence, strand::Strand, }; @@ -32,9 +33,58 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) let (graph, edges_by_node_pair) = Edge::build_graph(conn, &edges, &blocks); - let mut file = File::create(filename).unwrap(); + let file = File::create(filename).unwrap(); let mut writer = BufWriter::new(file); + let terminal_block_ids = write_blocks(&mut writer, blocks.clone()); + + write_edges( + &mut writer, + graph, + edges_by_node_pair.clone(), + terminal_block_ids, + ); + + let paths = Path::get_paths_for_collection(conn, collection_name); + let edges_by_path_id = + PathEdge::edges_for_paths(conn, paths.iter().map(|path| path.id).collect()); + let node_pairs_by_edge_id = edges_by_node_pair + .iter() + .map(|(node_pair, edge)| (edge.id, *node_pair)) + .collect::>(); + + println!("here1"); + for path in paths { + println!("here2"); + println!("{}", path.name); + let edges_for_path = edges_by_path_id.get(&path.id).unwrap(); + let mut node_ids = vec![]; + let mut node_strands = vec![]; + // Edges actually have too much information, the target of one is the same as the source of + // the next, so just iterate and take the target node to get the path of segments. + for edge in edges_for_path[0..edges_for_path.len() - 1].iter() { + let (_, target) = node_pairs_by_edge_id.get(&edge.id).unwrap(); + node_ids.push(*target); + node_strands.push(edge.target_strand); + } + + writer + .write_all(&path_line(&path.name, &node_ids, &node_strands).into_bytes()) + .unwrap_or_else(|_| panic!("Error writing path {} to GFA stream", path.name)); + } +} + +fn path_line(path_name: &str, node_ids: &[i32], node_strands: &[Strand]) -> String { + let nodes = node_ids + .iter() + .zip(node_strands.iter()) + .map(|(node_id, node_strand)| format!("{}{}", node_id + 1, node_strand)) + .collect::>() + .join(","); + format!("P\t{}\t{}\n", path_name, nodes) +} + +fn write_blocks(writer: &mut BufWriter, blocks: Vec) -> HashSet { let mut terminal_block_ids = HashSet::new(); for block in &blocks { if block.sequence_hash == Sequence::PATH_START_HASH @@ -53,12 +103,15 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) }); } - let blocks_by_id = blocks - .clone() - .into_iter() - .map(|block| (block.id, block)) - .collect::>(); + terminal_block_ids +} +fn write_edges( + writer: &mut BufWriter, + graph: DiGraphMap, + edges_by_node_pair: HashMap<(i32, i32), Edge>, + terminal_block_ids: HashSet, +) { for (source, target, ()) in graph.all_edges() { if terminal_block_ids.contains(&source) || terminal_block_ids.contains(&target) { continue; @@ -78,7 +131,7 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) } fn segment_line(sequence: &str, index: usize) -> String { - format!("S\t{}\t{}\t{}\n", index, sequence, "*") + format!("S\t{}\t{}\t{}\n", index + 1, sequence, "*") } fn link_line( @@ -89,7 +142,10 @@ fn link_line( ) -> String { format!( "L\t{}\t{}\t{}\t{}\t*\n", - source_index, source_strand, target_index, target_strand + source_index + 1, + source_strand, + target_index + 1, + target_strand ) } @@ -103,7 +159,7 @@ mod tests { block_group::BlockGroup, block_group_edge::BlockGroupEdge, collection::Collection, edge::Edge, sequence::Sequence, }; - use crate::test_helpers::get_connection; + use crate::test_helpers::{get_connection, setup_gen_dir}; use tempfile::tempdir; #[test] @@ -112,7 +168,7 @@ mod tests { let conn = get_connection(None); let collection_name = "test collection"; - let collection = Collection::create(&conn, collection_name); + Collection::create(&conn, collection_name); let block_group = BlockGroup::create(&conn, collection_name, None, "test block group"); let sequence1 = Sequence::new() .sequence_type("DNA") @@ -192,6 +248,14 @@ mod tests { block_group.id, &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); + + Path::create( + &conn, + "1234", + block_group.id, + &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], + ); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group.id); let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); @@ -208,5 +272,9 @@ mod tests { let all_sequences2 = BlockGroup::get_all_sequences(&conn, block_group2.id); assert_eq!(all_sequences, all_sequences2); + + let paths = Path::get_paths_for_collection(&conn, "test collection 2"); + assert_eq!(paths.len(), 1); + assert_eq!(Path::sequence(&conn, paths[0].clone()), "AAAATTTTGGGGCCCC"); } } diff --git a/src/models/block_group.rs b/src/models/block_group.rs index ccd98eb..654c25f 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -160,7 +160,7 @@ impl BlockGroup { BlockGroupEdge::bulk_create(conn, target_block_group_id, &edge_ids); for path in existing_paths { - let edge_ids = PathEdge::edges_for(conn, path.id) + let edge_ids = PathEdge::edges_for_path(conn, path.id) .into_iter() .map(|edge| edge.id) .collect::>(); diff --git a/src/models/collection.rs b/src/models/collection.rs index e60185a..1b65856 100644 --- a/src/models/collection.rs +++ b/src/models/collection.rs @@ -2,6 +2,7 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; use crate::models::block_group::BlockGroup; +use crate::models::path::Path; #[derive(Clone, Debug)] pub struct Collection { @@ -15,6 +16,7 @@ impl Collection { .unwrap(); stmt.exists([name]).unwrap() } + pub fn create(conn: &Connection, name: &str) -> Collection { let mut stmt = conn .prepare("INSERT INTO collection (name) VALUES (?1) RETURNING *") diff --git a/src/models/path.rs b/src/models/path.rs index 5932ed6..71b1ef0 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -119,6 +119,11 @@ impl Path { paths } + pub fn get_paths_for_collection(conn: &Connection, collection_name: &str) -> Vec { + let query = "SELECT path.id, path.block_group_id, path.name FROM path JOIN block_group ON path.block_group_id = block_group.id WHERE block_group.collection_name = ?1"; + Path::get_paths(conn, query, vec![Value::from(collection_name.to_string())]) + } + pub fn sequence(conn: &Connection, path: Path) -> String { let blocks = Path::blocks_for(conn, &path); blocks @@ -179,7 +184,7 @@ impl Path { } pub fn blocks_for(conn: &Connection, path: &Path) -> Vec { - let edges = PathEdge::edges_for(conn, path.id); + let edges = PathEdge::edges_for_path(conn, path.id); let mut sequence_hashes = HashSet::new(); for edge in &edges { if edge.source_hash != Sequence::PATH_START_HASH { diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs index a7a67a5..4cbf457 100644 --- a/src/models/path_edge.rs +++ b/src/models/path_edge.rs @@ -1,8 +1,10 @@ -use crate::models::edge::Edge; +use itertools::Itertools; use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; use std::collections::HashMap; +use crate::models::edge::Edge; + #[derive(Clone, Debug)] pub struct PathEdge { pub id: i32, @@ -72,7 +74,7 @@ impl PathEdge { objs } - pub fn edges_for(conn: &Connection, path_id: i32) -> Vec { + pub fn edges_for_path(conn: &Connection, path_id: i32) -> Vec { let path_edges = PathEdge::query( conn, "select * from path_edges where path_id = ?1 order by index_in_path ASC", @@ -92,6 +94,48 @@ impl PathEdge { .map(|edge_id| edges_by_id[&edge_id].clone()) .collect::>() } + + pub fn edges_for_paths(conn: &Connection, path_ids: Vec) -> HashMap> { + let placeholder_string = path_ids.iter().map(|_| "?").join(","); + let path_edges = PathEdge::query( + conn, + format!( + "select id, path_id, index_in_path, edge_id from path_edges where path_id in ({})", + placeholder_string + ) + .as_str(), + path_ids + .into_iter() + .map(Value::from) + .collect::>(), + ); + let edge_ids = path_edges + .clone() + .into_iter() + .map(|path_edge| path_edge.edge_id) + .collect::>(); + let edges = Edge::bulk_load(conn, &edge_ids); + let edges_by_id = edges + .into_iter() + .map(|edge| (edge.id, edge)) + .collect::>(); + let path_edges_by_path_id = path_edges + .into_iter() + .map(|path_edge| (path_edge.path_id, path_edge.edge_id)) + .into_group_map(); + path_edges_by_path_id + .into_iter() + .map(|(path_id, edge_ids)| { + ( + path_id, + edge_ids + .into_iter() + .map(|edge_id| edges_by_id[&edge_id].clone()) + .collect::>(), + ) + }) + .collect::>>() + } } mod tests {} From e7bf2c7ac255373489f4696f909c436534dd44b9 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 18 Sep 2024 12:23:46 -0400 Subject: [PATCH 2/4] Refactor, add test, fix test --- src/exports/gfa.rs | 143 +++++++++++++++++++++++++--------------- src/models/path_edge.rs | 2 +- 2 files changed, 92 insertions(+), 53 deletions(-) diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 1c69543..7e6d333 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -36,61 +36,33 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) let file = File::create(filename).unwrap(); let mut writer = BufWriter::new(file); - let terminal_block_ids = write_blocks(&mut writer, blocks.clone()); + let mut terminal_block_ids = HashSet::new(); + for block in &blocks { + if block.sequence_hash == Sequence::PATH_START_HASH + || block.sequence_hash == Sequence::PATH_END_HASH + { + terminal_block_ids.insert(block.id); + continue; + } + } - write_edges( + write_segments(&mut writer, blocks.clone(), terminal_block_ids.clone()); + write_links( &mut writer, graph, edges_by_node_pair.clone(), terminal_block_ids, ); - - let paths = Path::get_paths_for_collection(conn, collection_name); - let edges_by_path_id = - PathEdge::edges_for_paths(conn, paths.iter().map(|path| path.id).collect()); - let node_pairs_by_edge_id = edges_by_node_pair - .iter() - .map(|(node_pair, edge)| (edge.id, *node_pair)) - .collect::>(); - - println!("here1"); - for path in paths { - println!("here2"); - println!("{}", path.name); - let edges_for_path = edges_by_path_id.get(&path.id).unwrap(); - let mut node_ids = vec![]; - let mut node_strands = vec![]; - // Edges actually have too much information, the target of one is the same as the source of - // the next, so just iterate and take the target node to get the path of segments. - for edge in edges_for_path[0..edges_for_path.len() - 1].iter() { - let (_, target) = node_pairs_by_edge_id.get(&edge.id).unwrap(); - node_ids.push(*target); - node_strands.push(edge.target_strand); - } - - writer - .write_all(&path_line(&path.name, &node_ids, &node_strands).into_bytes()) - .unwrap_or_else(|_| panic!("Error writing path {} to GFA stream", path.name)); - } -} - -fn path_line(path_name: &str, node_ids: &[i32], node_strands: &[Strand]) -> String { - let nodes = node_ids - .iter() - .zip(node_strands.iter()) - .map(|(node_id, node_strand)| format!("{}{}", node_id + 1, node_strand)) - .collect::>() - .join(","); - format!("P\t{}\t{}\n", path_name, nodes) + write_paths(&mut writer, conn, collection_name, edges_by_node_pair); } -fn write_blocks(writer: &mut BufWriter, blocks: Vec) -> HashSet { - let mut terminal_block_ids = HashSet::new(); +fn write_segments( + writer: &mut BufWriter, + blocks: Vec, + terminal_block_ids: HashSet, +) { for block in &blocks { - if block.sequence_hash == Sequence::PATH_START_HASH - || block.sequence_hash == Sequence::PATH_END_HASH - { - terminal_block_ids.insert(block.id); + if terminal_block_ids.contains(&block.id) { continue; } writer @@ -102,11 +74,13 @@ fn write_blocks(writer: &mut BufWriter, blocks: Vec) -> HashSe ) }); } +} - terminal_block_ids +fn segment_line(sequence: &str, index: usize) -> String { + format!("S\t{}\t{}\t{}\n", index + 1, sequence, "*") } -fn write_edges( +fn write_links( writer: &mut BufWriter, graph: DiGraphMap, edges_by_node_pair: HashMap<(i32, i32), Edge>, @@ -130,10 +104,6 @@ fn write_edges( } } -fn segment_line(sequence: &str, index: usize) -> String { - format!("S\t{}\t{}\t{}\n", index + 1, sequence, "*") -} - fn link_line( source_index: i32, source_strand: Strand, @@ -149,6 +119,48 @@ fn link_line( ) } +fn write_paths( + writer: &mut BufWriter, + conn: &Connection, + collection_name: &str, + edges_by_node_pair: HashMap<(i32, i32), Edge>, +) { + let paths = Path::get_paths_for_collection(conn, collection_name); + let edges_by_path_id = + PathEdge::edges_for_paths(conn, paths.iter().map(|path| path.id).collect()); + let node_pairs_by_edge_id = edges_by_node_pair + .iter() + .map(|(node_pair, edge)| (edge.id, *node_pair)) + .collect::>(); + + for path in paths { + let edges_for_path = edges_by_path_id.get(&path.id).unwrap(); + let mut node_ids = vec![]; + let mut node_strands = vec![]; + // Edges actually have too much information, the target of one is the same as the source of + // the next, so just iterate and take the target node to get the path of segments. + for edge in edges_for_path[0..edges_for_path.len() - 1].iter() { + let (_, target) = node_pairs_by_edge_id.get(&edge.id).unwrap(); + node_ids.push(*target); + node_strands.push(edge.target_strand); + } + + writer + .write_all(&path_line(&path.name, &node_ids, &node_strands).into_bytes()) + .unwrap_or_else(|_| panic!("Error writing path {} to GFA stream", path.name)); + } +} + +fn path_line(path_name: &str, node_ids: &[i32], node_strands: &[Strand]) -> String { + let nodes = node_ids + .iter() + .zip(node_strands.iter()) + .map(|(node_id, node_strand)| format!("{}{}", node_id + 1, node_strand)) + .collect::>() + .join(","); + format!("P\t{}\t{}\n", path_name, nodes) +} + mod tests { use rusqlite::Connection; // Note this useful idiom: importing names from outer (for mod tests) scope. @@ -277,4 +289,31 @@ mod tests { assert_eq!(paths.len(), 1); assert_eq!(Path::sequence(&conn, paths[0].clone()), "AAAATTTTGGGGCCCC"); } + + #[test] + fn test_simple_round_trip() { + setup_gen_dir(); + let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + gfa_path.push("fixtures/simple.gfa"); + let collection_name = "test".to_string(); + let conn = &get_connection(None); + import_gfa(&gfa_path, &collection_name, conn); + + let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + + let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); + let mut gfa_path = PathBuf::from(temp_dir.path()); + gfa_path.push("intermediate.gfa"); + + export_gfa(conn, &collection_name, &gfa_path); + import_gfa(&gfa_path, "test collection 2", conn); + + let block_group2 = Collection::get_block_groups(conn, "test collection 2") + .pop() + .unwrap(); + let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); + + assert_eq!(all_sequences, all_sequences2); + } } diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs index 4cbf457..f5e8b80 100644 --- a/src/models/path_edge.rs +++ b/src/models/path_edge.rs @@ -100,7 +100,7 @@ impl PathEdge { let path_edges = PathEdge::query( conn, format!( - "select id, path_id, index_in_path, edge_id from path_edges where path_id in ({})", + "select id, path_id, index_in_path, edge_id from path_edges where path_id in ({}) ORDER BY path_id, index_in_path", placeholder_string ) .as_str(), From f6d5c3429e39e803479d6c830ca649d9e509271e Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 18 Sep 2024 15:15:31 -0400 Subject: [PATCH 3/4] Handle sequence splits --- src/exports/gfa.rs | 122 ++++++++++++++++++++++++++++++++++++++++++--- src/imports/gfa.rs | 29 +++++++++++ 2 files changed, 143 insertions(+), 8 deletions(-) diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 7e6d333..44dde5d 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -1,3 +1,4 @@ +use itertools::Itertools; use petgraph::prelude::DiGraphMap; use rusqlite::Connection; use std::collections::{HashMap, HashSet}; @@ -53,7 +54,13 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) edges_by_node_pair.clone(), terminal_block_ids, ); - write_paths(&mut writer, conn, collection_name, edges_by_node_pair); + write_paths( + &mut writer, + conn, + collection_name, + edges_by_node_pair, + &blocks, + ); } fn write_segments( @@ -119,11 +126,42 @@ fn link_line( ) } +// NOTE: A path is an immutable list of edges, but the sequence between the target of one edge and +// the source of the next may be "split" by later operations that add edges with sources or targets +// on a sequence that are in between those of a consecutive pair of edges in a path. This function +// handles that case by collecting all the nodes between the target of one edge and the source of +// the next. +fn nodes_for_edges( + edge1: &Edge, + edge2: &Edge, + blocks_by_hash_and_start: &HashMap<(&str, i32), GroupBlock>, + blocks_by_hash_and_end: &HashMap<(&str, i32), GroupBlock>, +) -> Vec { + let current_block = blocks_by_hash_and_start + .get(&(edge1.target_hash.as_str(), edge1.target_coordinate)) + .unwrap(); + let end_block = blocks_by_hash_and_end + .get(&(edge2.source_hash.as_str(), edge2.source_coordinate)) + .unwrap(); + let mut node_ids = vec![]; + #[allow(clippy::while_immutable_condition)] + while current_block.id != end_block.id { + node_ids.push(current_block.id); + let current_block = blocks_by_hash_and_start + .get(&(current_block.sequence_hash.as_str(), current_block.end)) + .unwrap(); + } + node_ids.push(end_block.id); + + node_ids +} + fn write_paths( writer: &mut BufWriter, conn: &Connection, collection_name: &str, edges_by_node_pair: HashMap<(i32, i32), Edge>, + blocks: &[GroupBlock], ) { let paths = Path::get_paths_for_collection(conn, collection_name); let edges_by_path_id = @@ -133,16 +171,30 @@ fn write_paths( .map(|(node_pair, edge)| (edge.id, *node_pair)) .collect::>(); + let blocks_by_hash_and_start = blocks + .iter() + .map(|block| ((block.sequence_hash.as_str(), block.start), block.clone())) + .collect::>(); + let blocks_by_hash_and_end = blocks + .iter() + .map(|block| ((block.sequence_hash.as_str(), block.end), block.clone())) + .collect::>(); + for path in paths { let edges_for_path = edges_by_path_id.get(&path.id).unwrap(); let mut node_ids = vec![]; let mut node_strands = vec![]; - // Edges actually have too much information, the target of one is the same as the source of - // the next, so just iterate and take the target node to get the path of segments. - for edge in edges_for_path[0..edges_for_path.len() - 1].iter() { - let (_, target) = node_pairs_by_edge_id.get(&edge.id).unwrap(); - node_ids.push(*target); - node_strands.push(edge.target_strand); + for (edge1, edge2) in edges_for_path.iter().tuple_windows() { + let current_node_ids = nodes_for_edges( + edge1, + edge2, + &blocks_by_hash_and_start, + &blocks_by_hash_and_end, + ); + for node_id in ¤t_node_ids { + node_ids.push(*node_id); + node_strands.push(edge1.target_strand); + } } writer @@ -155,7 +207,7 @@ fn path_line(path_name: &str, node_ids: &[i32], node_strands: &[Strand]) -> Stri let nodes = node_ids .iter() .zip(node_strands.iter()) - .map(|(node_id, node_strand)| format!("{}{}", node_id + 1, node_strand)) + .map(|(node_id, node_strand)| format!("{}{}", *node_id + 1, node_strand)) .collect::>() .join(","); format!("P\t{}\t{}\n", path_name, nodes) @@ -316,4 +368,58 @@ mod tests { assert_eq!(all_sequences, all_sequences2); } + + #[test] + fn test_anderson_round_trip() { + setup_gen_dir(); + let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + gfa_path.push("fixtures/anderson_promoters.gfa"); + let collection_name = "anderson promoters".to_string(); + let conn = &get_connection(None); + import_gfa(&gfa_path, &collection_name, conn); + + let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + + let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); + let mut gfa_path = PathBuf::from(temp_dir.path()); + gfa_path.push("intermediate.gfa"); + + export_gfa(conn, &collection_name, &gfa_path); + import_gfa(&gfa_path, "anderson promoters 2", conn); + + let block_group2 = Collection::get_block_groups(conn, "anderson promoters 2") + .pop() + .unwrap(); + let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); + + assert_eq!(all_sequences, all_sequences2); + } + + #[test] + fn test_reverse_strand_round_trip() { + setup_gen_dir(); + let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + gfa_path.push("fixtures/reverse_strand.gfa"); + let collection_name = "test".to_string(); + let conn = &get_connection(None); + import_gfa(&gfa_path, &collection_name, conn); + + let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + + let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); + let mut gfa_path = PathBuf::from(temp_dir.path()); + gfa_path.push("intermediate.gfa"); + + export_gfa(conn, &collection_name, &gfa_path); + import_gfa(&gfa_path, "test collection 2", conn); + + let block_group2 = Collection::get_block_groups(conn, "test collection 2") + .pop() + .unwrap(); + let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); + + assert_eq!(all_sequences, all_sequences2); + } } diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index 82a84cf..e3dc8cd 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -302,4 +302,33 @@ mod tests { let result = Path::sequence(conn, path); assert_eq!(result, "TATGCCAGCTGCGAATA"); } + + #[test] + fn test_import_anderson_promoters() { + let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + gfa_path.push("fixtures/anderson_promoters.gfa"); + let collection_name = "anderson promoters".to_string(); + let conn = &get_connection(None); + import_gfa(&gfa_path, &collection_name, conn); + + let paths = Path::get_paths_for_collection(conn, &collection_name); + assert_eq!(paths.len(), 20); + + let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); + let path = Path::get_paths( + conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![ + SQLValue::from(block_group_id), + SQLValue::from("BBa_J23100".to_string()), + ], + )[0] + .clone(); + + let result = Path::sequence(conn, path); + let expected_sequence_parts = vec!["T", "T", "G", "A", "C", "G", "GCTAGCTCAG", "T", "CCT", "A", "GG", "T", "A", "C", "A", "G", + "TGCTAGCTACTAGTGAAAGAGGAGAAATACTAGATGGCTTCCTCCGAAGACGTTATCAAAGAGTTCATGCGTTTCAAAGTTCGTATGGAAGGTTCCGTTAACGGTCACGAGTTCGAAATCGAAGGTGAAGGTGAAGGTCGTCCGTACGAAGGTACCCAGACCGCTAAACTGAAAGTTACCAAAGGTGGTCCGCTGCCGTTCGCTTGGGACATCCTGTCCCCGCAGTTCCAGTACGGTTCCAAAGCTTACGTTAAACACCCGGCTGACATCCCGGACTACCTGAAACTGTCCTTCCCGGAAGGTTTCAAATGGGAACGTGTTATGAACTTCGAAGACGGTGGTGTTGTTACCGTTACCCAGGACTCCTCCCTGCAAGACGGTGAGTTCATCTACAAAGTTAAACTGCGTGGTACCAACTTCCCGTCCGACGGTCCGGTTATGCAGAAAAAAACCATGGGTTGGGAAGCTTCCACCGAACGTATGTACCCGGAAGACGGTGCTCTGAAAGGTGAAATCAAAATGCGTCTGAAACTGAAAGACGGTGGTCACTACGACGCTGAAGTTAAAACCACCTACATGGCTAAAAAACCGGTTCAGCTGCCGGGTGCTTACAAAACCGACATCAAACTGGACATCACCTCCCACAACGAAGACTACACCATCGTTGAACAGTACGAACGTGCTGAAGGTCGTCACTCCACCGGTGCTTAATAACGCTGATAGTGCTAGTGTAGATCGCTACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATATACTAGAAGCGGCCGCTGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCAGAATTTCAGATAAAAAAAATCCTTAGCTTTCGCTAAGGATGATTTCTGGAATTCGCGGCCGCATCTAGAG"]; + let expected_sequence = expected_sequence_parts.join(""); + assert_eq!(result, expected_sequence); + } } From 2b9b0faed334385185ffd290687eaea2e22aaeb6 Mon Sep 17 00:00:00 2001 From: hofer Date: Mon, 23 Sep 2024 12:59:52 -0400 Subject: [PATCH 4/4] Address comment --- src/models/path.rs | 2 +- src/models/path_edge.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/path.rs b/src/models/path.rs index 71b1ef0..5600f61 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -120,7 +120,7 @@ impl Path { } pub fn get_paths_for_collection(conn: &Connection, collection_name: &str) -> Vec { - let query = "SELECT path.id, path.block_group_id, path.name FROM path JOIN block_group ON path.block_group_id = block_group.id WHERE block_group.collection_name = ?1"; + let query = "SELECT * FROM path JOIN block_group ON path.block_group_id = block_group.id WHERE block_group.collection_name = ?1"; Path::get_paths(conn, query, vec![Value::from(collection_name.to_string())]) } diff --git a/src/models/path_edge.rs b/src/models/path_edge.rs index f5e8b80..750d9b0 100644 --- a/src/models/path_edge.rs +++ b/src/models/path_edge.rs @@ -100,7 +100,7 @@ impl PathEdge { let path_edges = PathEdge::query( conn, format!( - "select id, path_id, index_in_path, edge_id from path_edges where path_id in ({}) ORDER BY path_id, index_in_path", + "select * from path_edges where path_id in ({}) ORDER BY path_id, index_in_path", placeholder_string ) .as_str(),