diff --git a/fixtures/general.vcf b/fixtures/general.vcf new file mode 100644 index 0000000..4aaa543 --- /dev/null +++ b/fixtures/general.vcf @@ -0,0 +1,54 @@ +##fileformat=VCFv4.1 +##filedate=Tue Sep 4 13:12:57 2018 +##reference=simple.fa +##contig= +##phasing=none +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##filter="QUAL > 1" +##FILTER= +#CHROM POS ID REF ALT QUAL FILTER INFO +m123 3 . CGA CA 1611.92 . AB=0;ABP=0;AC=2;AF=1;AN=2;AO=53;CIGAR=1M1D1M;DP=56;DPB=38;DPRA=0;EPP=3.37904;EPPR=0;GTI=0;LEN=1;MEANALT=3;MQM=60;MQMR=0;NS=1;NUMALT=1;ODDS=76.9802;PAIRED=1;PAIREDR=0;PAO=0;PQA=0;PQR=0;PRO=0;QA=1837;QR=0;RO=0;RPL=18;RPP=14.851;RPPR=0;RPR=35;RUN=1;SAF=27;SAP=3.05127;SAR=26;SRF=0;SRP=0;SRR=0;TYPE=del +m123 10 . TC TAGA 2216.6 . AB=0;ABP=0;AC=2;AF=1;AN=2;AO=74;CIGAR=1M2I1X;DP=76;DPB=154.5;DPRA=0;EPP=4.06669;EPPR=0;GTI=0;LEN=3;MEANALT=3;MQM=60;MQMR=0;NS=1;NUMALT=1;ODDS=113.024;PAIRED=0.986486;PAIREDR=0;PAO=2.5;PQA=92.5;PQR=92.5;PRO=2.5;QA=2502;QR=0;RO=0;RPL=31;RPP=7.23587;RPPR=0;RPR=43;RUN=1;SAF=37;SAP=3.0103;SAR=37;SRF=0;SRP=0;SRR=0;TYPE=complex diff --git a/migrations/01-initial/up.sql b/migrations/01-initial/up.sql index 1863b07..d6a797f 100644 --- a/migrations/01-initial/up.sql +++ b/migrations/01-initial/up.sql @@ -65,4 +65,18 @@ CREATE TABLE path_blocks ( FOREIGN KEY(target_block_id) REFERENCES block(id), FOREIGN KEY(path_id) REFERENCES path(id) ); -CREATE UNIQUE INDEX path_blocks_uidx ON path_blocks(path_id, source_block_id, target_block_id); \ No newline at end of file +CREATE UNIQUE INDEX path_blocks_uidx ON path_blocks(path_id, source_block_id, target_block_id); + +CREATE TABLE change_log ( + hash TEXT PRIMARY KEY NOT NULL, + path_id INTEGER NOT NULL, + path_start INTEGER NOT NULL, + path_end INTEGER NOT NULL, + sequence_hash TEXT NOT NULL, + sequence_start INTEGER NOT NULL, + sequence_end INTEGER NOT NULL, + sequence_strand TEXT NOT NULL, + FOREIGN KEY(path_id) REFERENCES path(id), + FOREIGN KEY(sequence_hash) REFERENCES sequence(hash) +); +CREATE UNIQUE INDEX change_log_uidx ON change_log(hash); \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index fed39ca..92dcad5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ pub mod migrations; pub mod models; use crate::migrations::run_migrations; +use noodles::vcf::variant::record::samples::series::value::genotype::Phasing; use rusqlite::Connection; use sha2::{Digest, Sha256}; @@ -30,6 +31,51 @@ pub fn calculate_hash(t: &str) -> String { format!("{:x}", result) } +pub struct Genotype { + pub allele: i32, + pub phasing: Phasing, +} + +pub fn parse_genotype(gt: &str) -> Vec> { + let mut genotypes = vec![]; + let mut phase = match gt.contains("/") { + true => Phasing::Unphased, + false => Phasing::Phased, + }; + for (index, entry) in gt.split_inclusive(|c| c == '|' || c == '/').enumerate() { + let mut allele; + let mut phasing = Phasing::Unphased; + if entry.ends_with(['/', '|']) { + let (allele_str, phasing_str) = entry.split_at(entry.len() - 1); + allele = allele_str; + phasing = match phasing_str == "|" { + true => Phasing::Phased, + false => Phasing::Unphased, + } + } else { + allele = entry; + } + if allele == "." { + genotypes.push(None); + } else { + genotypes.push(Some(Genotype { + allele: allele.parse::().unwrap(), + phasing: phase, + })); + } + // we're always 1 behind on phase, e.g. 0|1, the | is the phase of the next allele + phase = phasing; + } + genotypes +} + +pub fn get_overlap(a: i32, b: i32, x: i32, y: i32) -> (bool, bool, bool) { + let contains_start = a <= x && x < b; + let contains_end = a < y && y < b; + let overlap = a < y && x < b; + (contains_start, contains_end, overlap) +} + #[cfg(test)] mod tests { use super::*; @@ -62,4 +108,62 @@ mod tests { .unwrap(); assert_eq!(sequence_count, 0); } + + #[test] + fn parses_genotype() { + let genotypes = parse_genotype("1"); + let genotype_1 = genotypes[0].as_ref().unwrap(); + assert_eq!(genotype_1.allele, 1); + assert_eq!(genotype_1.phasing, Phasing::Phased); + let genotypes = parse_genotype("0|1"); + let genotype_1 = genotypes[0].as_ref().unwrap(); + let genotype_2 = genotypes[1].as_ref().unwrap(); + assert_eq!(genotype_1.allele, 0); + assert_eq!(genotype_1.phasing, Phasing::Phased); + assert_eq!(genotype_2.allele, 1); + assert_eq!(genotype_2.phasing, Phasing::Phased); + let genotypes = parse_genotype("0/1"); + let genotype_1 = genotypes[0].as_ref().unwrap(); + let genotype_2 = genotypes[1].as_ref().unwrap(); + assert_eq!(genotype_1.allele, 0); + assert_eq!(genotype_1.phasing, Phasing::Unphased); + assert_eq!(genotype_2.allele, 1); + assert_eq!(genotype_2.phasing, Phasing::Unphased); + let genotypes = parse_genotype("0/1|2"); + let genotype_1 = genotypes[0].as_ref().unwrap(); + let genotype_2 = genotypes[1].as_ref().unwrap(); + let genotype_3 = genotypes[2].as_ref().unwrap(); + assert_eq!(genotype_1.allele, 0); + assert_eq!(genotype_1.phasing, Phasing::Unphased); + assert_eq!(genotype_2.allele, 1); + assert_eq!(genotype_2.phasing, Phasing::Unphased); + assert_eq!(genotype_3.allele, 2); + assert_eq!(genotype_3.phasing, Phasing::Phased); + let genotypes = parse_genotype("2|1|2"); + let genotype_1 = genotypes[0].as_ref().unwrap(); + let genotype_2 = genotypes[1].as_ref().unwrap(); + let genotype_3 = genotypes[2].as_ref().unwrap(); + assert_eq!(genotype_1.allele, 2); + assert_eq!(genotype_1.phasing, Phasing::Phased); + assert_eq!(genotype_2.allele, 1); + assert_eq!(genotype_2.phasing, Phasing::Phased); + assert_eq!(genotype_3.allele, 2); + assert_eq!(genotype_3.phasing, Phasing::Phased); + let genotypes = parse_genotype("2|.|2"); + let genotype_1 = genotypes[0].as_ref().unwrap(); + let genotype_3 = genotypes[2].as_ref().unwrap(); + assert_eq!(genotype_1.allele, 2); + assert_eq!(genotype_1.phasing, Phasing::Phased); + assert_eq!(genotype_3.allele, 2); + assert_eq!(genotype_3.phasing, Phasing::Phased); + assert!(genotypes[1].is_none()); + } + + #[test] + fn test_overlaps() { + assert_eq!(get_overlap(0, 10, 10, 10), (false, false, false)); + assert_eq!(get_overlap(10, 20, 10, 20), (true, false, true)); + assert_eq!(get_overlap(10, 20, 5, 15), (false, true, true)); + assert_eq!(get_overlap(10, 20, 0, 10), (false, false, false)); + } } diff --git a/src/main.rs b/src/main.rs index bc277f5..5735b87 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,14 +5,15 @@ use std::fmt::Debug; use std::path::PathBuf; use bio::io::fasta; -use gen::get_connection; use gen::migrations::run_migrations; use gen::models::{self, block::Block, edge::Edge, path::Path, sequence::Sequence, BlockGroup}; +use gen::{get_connection, parse_genotype}; use noodles::vcf; use noodles::vcf::variant::record::samples::series::value::genotype::Phasing; use noodles::vcf::variant::record::samples::series::Value; use noodles::vcf::variant::record::samples::{Sample, Series}; use noodles::vcf::variant::record::{AlternateBases, ReferenceBases, Samples}; +use noodles::vcf::variant::record_buf::samples::sample::value::genotype::Genotype; use noodles::vcf::variant::Record; use rusqlite::{types::Value as SQLValue, Connection}; use std::io; @@ -55,10 +56,17 @@ enum Commands { /// A VCF file to incorporate #[arg(short, long)] vcf: String, + /// If no genotype is provided, enter the genotype to assign variants + #[arg(short, long)] + genotype: Option, + /// If no sample is provided, enter the sample to associate variants to + #[arg(short, long)] + sample: Option, }, } fn import_fasta(fasta: &String, name: &String, shallow: bool, conn: &mut Connection) { + // TODO: support gz let mut reader = fasta::Reader::from_file(fasta).unwrap(); run_migrations(conn); @@ -90,7 +98,13 @@ fn import_fasta(fasta: &String, name: &String, shallow: bool, conn: &mut Connect } } -fn update_with_vcf(vcf_path: &String, collection_name: &String, conn: &mut Connection) { +fn update_with_vcf( + vcf_path: &String, + collection_name: &String, + fixed_genotype: String, + fixed_sample: String, + conn: &mut Connection, +) { run_migrations(conn); let mut reader = vcf::io::reader::Builder::default() @@ -101,6 +115,13 @@ fn update_with_vcf(vcf_path: &String, collection_name: &String, conn: &mut Conne for name in sample_names { models::Sample::create(conn, name); } + if !fixed_sample.is_empty() { + models::Sample::create(conn, &fixed_sample); + } + let mut genotype = vec![]; + if !fixed_genotype.is_empty() { + genotype = parse_genotype(&fixed_genotype); + } for result in reader.records() { let record = result.unwrap(); let seq_name = record.reference_sequence_name().to_string(); @@ -110,58 +131,106 @@ fn update_with_vcf(vcf_path: &String, collection_name: &String, conn: &mut Conne let ref_end = record.variant_end(&header).unwrap().get(); let alt_bases = record.alternate_bases(); let alt_alleles: Vec<_> = alt_bases.iter().collect::>().unwrap(); - for (sample_index, sample) in record.samples().iter().enumerate() { - let genotype = sample.get(&header, "GT"); - if genotype.is_some() { - if let Value::Genotype(genotypes) = genotype.unwrap().unwrap().unwrap() { - for (chromosome_index, gt) in genotypes.iter().enumerate() { - if gt.is_ok() { - let (allele, phasing) = gt.unwrap(); - let phased = match phasing { - Phasing::Phased => 1, - Phasing::Unphased => 0, - }; - let allele = allele.unwrap(); - if allele != 0 { - let alt_seq = alt_alleles[allele - 1]; - // TODO: new sequence may not be real and be or some sort. Handle these. - let new_sequence_hash = Sequence::create( - conn, - "DNA".to_string(), - &alt_seq.to_string(), - true, - ); - let sample_bg_id = BlockGroup::get_or_create_sample_block_group( - conn, - collection_name, - &sample_names[sample_index], - &seq_name, - ); - let sample_path_id = Path::get_paths( - conn, - "select * from path where block_group_id = ?1 AND name = ?2", - vec![ - SQLValue::from(sample_bg_id), - SQLValue::from(seq_name.clone()), - ], - ); - let new_block_id = Block::create( - conn, - &new_sequence_hash, - sample_bg_id, - 0, - alt_seq.len() as i32, - &"1".to_string(), - ); - BlockGroup::insert_change( - conn, - sample_path_id[0].id, - ref_start as i32, - ref_end as i32, - new_block_id.id, - chromosome_index as i32, - phased, - ); + if !fixed_sample.is_empty() && !genotype.is_empty() { + for (chromosome_index, genotype) in genotype.iter().enumerate() { + if let Some(gt) = genotype { + if gt.allele != 0 { + let alt_seq = alt_alleles[chromosome_index - 1]; + let phased = match gt.phasing { + Phasing::Phased => 1, + Phasing::Unphased => 0, + }; + // TODO: new sequence may not be real and be or some sort. Handle these. + let new_sequence_hash = + Sequence::create(conn, "DNA".to_string(), &alt_seq.to_string(), true); + let sample_bg_id = BlockGroup::get_or_create_sample_block_group( + conn, + collection_name, + &fixed_sample, + &seq_name, + ); + let sample_path_id = Path::get_paths( + conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![ + SQLValue::from(sample_bg_id), + SQLValue::from(seq_name.clone()), + ], + ); + let new_block_id = Block::create( + conn, + &new_sequence_hash, + sample_bg_id, + 0, + alt_seq.len() as i32, + &"1".to_string(), + ); + BlockGroup::insert_change( + conn, + sample_path_id[0].id, + ref_start as i32, + ref_end as i32, + &new_block_id, + chromosome_index as i32, + phased, + ); + } + } + } + } else { + for (sample_index, sample) in record.samples().iter().enumerate() { + let genotype = sample.get(&header, "GT"); + if genotype.is_some() { + if let Value::Genotype(genotypes) = genotype.unwrap().unwrap().unwrap() { + for (chromosome_index, gt) in genotypes.iter().enumerate() { + if gt.is_ok() { + let (allele, phasing) = gt.unwrap(); + let phased = match phasing { + Phasing::Phased => 1, + Phasing::Unphased => 0, + }; + let allele = allele.unwrap(); + if allele != 0 { + let alt_seq = alt_alleles[allele - 1]; + // TODO: new sequence may not be real and be or some sort. Handle these. + let new_sequence_hash = Sequence::create( + conn, + "DNA".to_string(), + &alt_seq.to_string(), + true, + ); + let sample_bg_id = BlockGroup::get_or_create_sample_block_group( + conn, + collection_name, + &sample_names[sample_index], + &seq_name, + ); + let sample_path_id = Path::get_paths( + conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![ + SQLValue::from(sample_bg_id), + SQLValue::from(seq_name.clone()), + ], + ); + let new_block_id = Block::create( + conn, + &new_sequence_hash, + sample_bg_id, + 0, + alt_seq.len() as i32, + &"1".to_string(), + ); + BlockGroup::insert_change( + conn, + sample_path_id[0].id, + ref_start as i32, + ref_end as i32, + &new_block_id, + chromosome_index as i32, + phased, + ); + } } } } @@ -186,7 +255,15 @@ fn main() { db, fasta, vcf, - }) => update_with_vcf(vcf, name, &mut get_connection(db)), + genotype, + sample, + }) => update_with_vcf( + vcf, + name, + genotype.clone().unwrap_or("".to_string()), + sample.clone().unwrap_or("".to_string()), + &mut get_connection(db), + ), None => {} } } @@ -232,7 +309,13 @@ mod tests { false, conn, ); - update_with_vcf(&vcf_path.to_str().unwrap().to_string(), &collection, conn); + update_with_vcf( + &vcf_path.to_str().unwrap().to_string(), + &collection, + "".to_string(), + "".to_string(), + conn, + ); assert_eq!( BlockGroup::get_all_sequences(conn, 1), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) diff --git a/src/models.rs b/src/models.rs index b8f57de..9cd7cd1 100644 --- a/src/models.rs +++ b/src/models.rs @@ -5,6 +5,7 @@ use petgraph::visit::{Dfs, IntoNeighborsDirected, NodeCount}; use petgraph::Direction; use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; +use sha2::{Digest, Sha256}; use std::collections::{HashMap, HashSet}; use std::fmt::*; use std::hash::Hash; @@ -14,11 +15,11 @@ pub mod edge; pub mod path; pub mod sequence; -use crate::models; use crate::models::block::Block; use crate::models::edge::Edge; use crate::models::path::{all_simple_paths, Path, PathBlock}; use crate::models::sequence::Sequence; +use crate::{get_overlap, models}; #[derive(Debug)] pub struct Collection { @@ -66,12 +67,22 @@ pub struct Sample { impl Sample { pub fn create(conn: &mut Connection, name: &String) -> Sample { let mut stmt = conn - .prepare("INSERT INTO sample (name) VALUES (?1) RETURNING *") + .prepare("INSERT INTO sample (name) VALUES (?1)") .unwrap(); - let mut rows = stmt - .query_map((name,), |row| Ok(Sample { name: row.get(0)? })) - .unwrap(); - rows.next().unwrap().unwrap() + match stmt.execute((name,)) { + Ok(_) => Sample { name: name.clone() }, + Err(rusqlite::Error::SqliteFailure(err, details)) => { + if err.code == rusqlite::ErrorCode::ConstraintViolation { + println!("{err:?} {details:?}"); + Sample { name: name.clone() } + } else { + panic!("something bad happened querying the database") + } + } + Err(_) => { + panic!("something bad happened querying the database") + } + } } } @@ -100,7 +111,7 @@ impl BlockGroup { name: row.get(3)?, }) }) { - Ok(path) => path, + Ok(res) => res, Err(rusqlite::Error::SqliteFailure(err, details)) => { if err.code == rusqlite::ErrorCode::ConstraintViolation { println!("{err:?} {details:?}"); @@ -331,11 +342,23 @@ impl BlockGroup { path_id: i32, start: i32, end: i32, - new_block_id: i32, + new_block: &Block, chromosome_index: i32, phased: i32, ) { - println!("change is {path_id} {start} {end} {new_block_id}"); + let new_block_id = new_block.id; + let change = ChangeLog::new( + path_id, + start, + end, + new_block.sequence_hash.clone(), + new_block.start, + new_block.end, + new_block.strand.clone(), + ); + if ChangeLog::exists(conn, &change.hash) { + return; + } // todo: // 1. get blocks where start-> end overlap // 2. split old blocks at boundary points, make new block for left/right side @@ -354,7 +377,6 @@ impl BlockGroup { let path = Path::get(conn, path_id); let graph = PathBlock::blocks_to_graph(conn, path.id); - println!("{path:?} {graph:?}"); let query = format!("SELECT id, sequence_hash, block_group_id, start, end, strand from block where id in ({block_ids})", block_ids = graph.nodes().map(|k| format!("{k}")).collect::>().join(",")); let mut stmt = conn.prepare(&query).unwrap(); let mut blocks: HashMap = HashMap::new(); @@ -381,14 +403,22 @@ impl BlockGroup { let mut path_end = 0; let mut new_edges = vec![]; let mut previous_block: Option<&Block> = None; + let mut loose_connection = false; for block_id in &path.blocks { let block = blocks.get(block_id).unwrap(); let block_length = (block.end - block.start); path_end += block_length; - let contains_start = path_start <= start && start < path_end; - let contains_end = path_start <= end && end < path_end; - let overlap = path_start <= end && start <= path_end; + if loose_connection && path_start == end { + new_edges.push((Some(new_block_id), Some(block.id))); + loose_connection = false; + } + + let (contains_start, contains_end, overlap) = + get_overlap(path_start, path_end, start, end); + println!( + "{path_start} {path_end} {start} {end} {contains_start} {contains_end} {overlap}" + ); if contains_start && contains_end { // our range is fully contained w/in the block @@ -428,15 +458,22 @@ impl BlockGroup { // our range is overlapping the end of the block // |----block---| // |----range---| - let (left_block, right_block) = Block::split( - conn, - block, - block.start + start - path_start, - chromosome_index, - phased, - ) - .unwrap(); - Block::delete(conn, block.id); + let split_point = block.start + start - path_start; + println!("{block:?} {split_point}"); + if split_point == block.start { + if let Some(pb) = previous_block { + new_edges.push((Some(pb.id), Some(new_block_id))); + } + // we actually are skipping this block + if path_end >= end { + loose_connection = true; + } + } else { + let (left_block, right_block) = + Block::split(conn, block, split_point, chromosome_index, phased).unwrap(); + Block::delete(conn, block.id); + new_edges.push((Some(left_block.id), Some(new_block_id))); + } // let left_block = Block::create( // conn, // &block.sequence_hash, @@ -450,37 +487,15 @@ impl BlockGroup { // } else { // new_edges.push((None, Some(left_block.id))); // } - new_edges.push((Some(left_block.id), Some(new_block_id))); } else if contains_end { // our range is overlapping the beginning of the block // |----block---| // |----range---| - let (left_block, right_block) = Block::split( - conn, - block, - block.start + end - path_start, - chromosome_index, - phased, - ) - .unwrap(); + let split_point = block.start + end - path_start; + let (left_block, right_block) = + Block::split(conn, block, split_point, chromosome_index, phased).unwrap(); Block::delete(conn, block.id); - // let right_block = Block::create( - // conn, - // &block.sequence_hash, - // block_group_id, - // end - path_start, - // block.end, - // &block.strand, - // ); - // // what stuff went to this block? new_edges.push((Some(new_block_id), Some(right_block.id))); - // let last_node = dfs.next(&graph); - // if last_node.is_some() { - // let next_block = blocks.get(&(last_node.unwrap() as i32)).unwrap(); - // new_edges.push((Some(right_block.id), Some(next_block.id))); - // } else { - // new_edges.push((Some(right_block.id), None)) - // } break; } else if overlap { // our range is the whole block, ignore it @@ -500,6 +515,79 @@ impl BlockGroup { for new_edge in new_edges { Edge::create(conn, new_edge.0, new_edge.1, chromosome_index, phased); } + + change.save(conn); + } +} + +pub struct ChangeLog { + hash: String, + path_id: i32, + path_start: i32, + path_end: i32, + seq_hash: String, + seq_start: i32, + seq_end: i32, + strand: String, +} + +impl ChangeLog { + pub fn new( + path_id: i32, + path_start: i32, + path_end: i32, + seq_hash: String, + seq_start: i32, + seq_end: i32, + seq_strand: String, + ) -> ChangeLog { + let mut hasher = Sha256::new(); + hasher.update(path_id.to_string()); + hasher.update(path_start.to_string()); + hasher.update(path_end.to_string()); + hasher.update(&seq_hash); + hasher.update(seq_start.to_string()); + hasher.update(seq_end.to_string()); + hasher.update(&seq_strand); + let result = hasher.finalize(); + let hash = format!("{:x}", result); + ChangeLog { + hash, + path_id, + path_start, + path_end, + seq_hash, + seq_start, + seq_end, + strand: seq_strand, + } + } + + pub fn save(&self, conn: &Connection) { + ChangeLog::create(conn, self); + } + + pub fn create(conn: &Connection, change_log: &ChangeLog) { + let mut stmt = conn + .prepare("INSERT INTO change_log (hash, path_id, path_start, path_end, sequence_hash, sequence_start, sequence_end, sequence_strand) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8);") + .unwrap(); + let placeholders = vec![ + Value::from(change_log.hash.clone()), + Value::from(change_log.path_id), + Value::from(change_log.path_start), + Value::from(change_log.path_end), + Value::from(change_log.seq_hash.clone()), + Value::from(change_log.seq_start), + Value::from(change_log.seq_end), + Value::from(change_log.strand.clone()), + ]; + stmt.execute(params_from_iter(placeholders)).unwrap(); + } + + pub fn exists(conn: &mut Connection, hash: &String) -> bool { + let query = "SELECT hash from change_log where hash = ?1;"; + let mut stmt = conn.prepare(query).unwrap(); + stmt.exists((hash,)).unwrap() } } @@ -544,9 +632,8 @@ mod tests { } #[test] - fn simple_insert() { - fs::remove_file("test.db"); - let mut conn = get_db_connection("test.db"); + fn insert_and_deletion() { + let mut conn = get_connection(); let (block_group_id, path_id) = setup_block_group(&mut conn); let insert_sequence = Sequence::create(&mut conn, "DNA".to_string(), &"NNNN".to_string(), true); @@ -558,7 +645,7 @@ mod tests { 4, &"1".to_string(), ); - BlockGroup::insert_change(&mut conn, path_id, 7, 15, insert.id, 1, 0); + BlockGroup::insert_change(&mut conn, path_id, 7, 15, &insert, 1, 0); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( @@ -582,7 +669,7 @@ mod tests { ); // take out an entire block. - BlockGroup::insert_change(&mut conn, path_id, 19, 31, deletion.id, 1, 0); + BlockGroup::insert_change(&mut conn, path_id, 19, 31, &deletion, 1, 0); let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, @@ -594,4 +681,198 @@ mod tests { ]) ) } + + #[test] + fn simple_insert() { + let mut conn = get_connection(); + let (block_group_id, path_id) = setup_block_group(&mut conn); + let insert_sequence = + Sequence::create(&mut conn, "DNA".to_string(), &"NNNN".to_string(), true); + let insert = Block::create( + &conn, + &insert_sequence, + block_group_id, + 0, + 4, + &"1".to_string(), + ); + BlockGroup::insert_change(&mut conn, path_id, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_on_block_boundary_start() { + let mut conn = get_connection(); + let (block_group_id, path_id) = setup_block_group(&mut conn); + let insert_sequence = + Sequence::create(&mut conn, "DNA".to_string(), &"NNNN".to_string(), true); + let insert = Block::create( + &conn, + &insert_sequence, + block_group_id, + 0, + 4, + &"1".to_string(), + ); + BlockGroup::insert_change(&mut conn, path_id, 10, 10, &insert, 1, 0); + + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAANNNNTTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_on_block_boundary_end() { + let mut conn = get_connection(); + let (block_group_id, path_id) = setup_block_group(&mut conn); + let insert_sequence = + Sequence::create(&mut conn, "DNA".to_string(), &"NNNN".to_string(), true); + let insert = Block::create( + &conn, + &insert_sequence, + block_group_id, + 0, + 4, + &"1".to_string(), + ); + BlockGroup::insert_change(&mut conn, path_id, 9, 9, &insert, 1, 0); + + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAANNNNATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_across_entire_block_boundary() { + let mut conn = get_connection(); + let (block_group_id, path_id) = setup_block_group(&mut conn); + let insert_sequence = + Sequence::create(&mut conn, "DNA".to_string(), &"NNNN".to_string(), true); + let insert = Block::create( + &conn, + &insert_sequence, + block_group_id, + 0, + 4, + &"1".to_string(), + ); + BlockGroup::insert_change(&mut conn, path_id, 10, 20, &insert, 1, 0); + + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAANNNNCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn insert_across_two_blocks() { + let mut conn = get_connection(); + let (block_group_id, path_id) = setup_block_group(&mut conn); + let insert_sequence = + Sequence::create(&mut conn, "DNA".to_string(), &"NNNN".to_string(), true); + let insert = Block::create( + &conn, + &insert_sequence, + block_group_id, + 0, + 4, + &"1".to_string(), + ); + BlockGroup::insert_change(&mut conn, path_id, 15, 25, &insert, 1, 0); + + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTTTTNNNNCCCCCGGGGGGGGGG".to_string() + ]) + ); + } + + #[test] + fn simple_deletion() { + let mut conn = get_connection(); + let (block_group_id, path_id) = setup_block_group(&mut conn); + let deletion_sequence = + Sequence::create(&mut conn, "DNA".to_string(), &"".to_string(), true); + let deletion = Block::create( + &conn, + &deletion_sequence, + block_group_id, + 0, + 0, + &"1".to_string(), + ); + + // take out an entire block. + BlockGroup::insert_change(&mut conn, path_id, 19, 31, &deletion, 1, 0); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAAAAATTTTTTTTTGGGGGGGGG".to_string(), + ]) + ) + } + + #[test] + fn doesnt_apply_same_insert_twice() { + let mut conn = get_connection(); + let (block_group_id, path_id) = setup_block_group(&mut conn); + let insert_sequence = + Sequence::create(&mut conn, "DNA".to_string(), &"NNNN".to_string(), true); + let insert = Block::create( + &conn, + &insert_sequence, + block_group_id, + 0, + 4, + &"1".to_string(), + ); + BlockGroup::insert_change(&mut conn, path_id, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + + BlockGroup::insert_change(&mut conn, path_id, 7, 15, &insert, 1, 0); + + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + assert_eq!( + all_sequences, + HashSet::from_iter(vec![ + "AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGG".to_string(), + "AAAAAAANNNNTTTTTCCCCCCCCCCGGGGGGGGGG".to_string() + ]) + ); + } } diff --git a/src/models/block.rs b/src/models/block.rs index bc93891..fea37ff 100644 --- a/src/models/block.rs +++ b/src/models/block.rs @@ -64,6 +64,7 @@ impl Block { } pub fn delete(conn: &Connection, block_id: i32) { + println!("deleting {block_id}"); let mut stmt = conn .prepare_cached("DELETE from block where id = ?1") .unwrap(); @@ -249,6 +250,14 @@ impl Block { } objs } + + pub fn get_sequence(conn: &Connection, block_id: i32) -> (String, String) { + let mut stmt = conn.prepare_cached("select substr(sequence.sequence, block.start + 1, block.end - block.start) as sequence, block.strand from sequence left join block on (block.sequence_hash = sequence.hash) where block.id = ?1").unwrap(); + let mut rows = stmt + .query_map((block_id,), |row| Ok((row.get(0)?, row.get(1)?))) + .unwrap(); + rows.next().unwrap().unwrap() + } } #[cfg(test)] diff --git a/src/models/path.rs b/src/models/path.rs index b1d06fe..5d4a881 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -1,10 +1,11 @@ -use crate::models::edge::Edge; +use crate::models::block::Block; use petgraph::graphmap::DiGraphMap; use petgraph::prelude::Dfs; use petgraph::visit::{IntoNeighborsDirected, NodeCount}; use petgraph::{Direction, Outgoing}; use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; +use sha2::digest::generic_array::sequence; use std::hash::Hash; use std::iter::from_fn; @@ -79,6 +80,16 @@ impl Path { } paths } + + pub fn sequence(conn: &Connection, path_id: i32) -> String { + let block_ids = PathBlock::get_blocks(conn, path_id); + // todo: fix this to handle strand and join order + let mut sequence = "".to_string(); + for block_id in block_ids { + sequence.push_str(&Block::get_sequence(conn, block_id).0); + } + sequence + } } #[derive(Debug)]