From 8a31631c056e838ca08b4b46df444839e6be5924 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 28 Aug 2024 16:56:25 -0400 Subject: [PATCH 1/2] Add sequence cache --- src/main.rs | 115 +++++++++++++++++++++++++++++++---------- src/models/sequence.rs | 2 +- 2 files changed, 89 insertions(+), 28 deletions(-) diff --git a/src/main.rs b/src/main.rs index 114b5f2..bac9216 100644 --- a/src/main.rs +++ b/src/main.rs @@ -179,29 +179,72 @@ impl<'a> BlockGroupCache<'_> { } } +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub struct SequenceKey<'a> { + sequence_type: &'a str, + sequence: String, +} + +#[derive(Debug)] +pub struct SequenceCache<'a> { + pub cache: HashMap, Sequence>, + pub conn: &'a Connection, +} + +impl<'a> SequenceCache<'_> { + pub fn new(conn: &Connection) -> SequenceCache { + SequenceCache { + cache: HashMap::::new(), + conn, + } + } + + pub fn lookup( + sequence_cache: &mut SequenceCache<'a>, + sequence_type: &'a str, + sequence: String, + ) -> Sequence { + let sequence_key = SequenceKey { + sequence_type, + sequence: sequence.clone(), + }; + let sequence_lookup = sequence_cache.cache.get(&sequence_key); + if let Some(found_sequence) = sequence_lookup { + found_sequence.clone() + } else { + let new_sequence_hash = Sequence::new() + .sequence_type("DNA") + .sequence(&sequence) + .save(sequence_cache.conn); + let new_sequence = + Sequence::sequence_from_hash(sequence_cache.conn, &new_sequence_hash).unwrap(); + + sequence_cache + .cache + .insert(sequence_key, new_sequence.clone()); + new_sequence + } + } +} + #[allow(clippy::too_many_arguments)] fn prepare_change( conn: &Connection, sample_bg_id: i32, sample_path: &Path, - alt_seq: &str, ref_start: i32, ref_end: i32, chromosome_index: i32, phased: i32, + sequence: Sequence, ) -> PathChange { // TODO: new sequence may not be real and be or some sort. Handle these. - let new_sequence_hash = Sequence::new() - .sequence_type("DNA") - .sequence(alt_seq) - .save(conn); - let sequence = Sequence::sequence_from_hash(conn, &new_sequence_hash).unwrap(); let new_block = NewBlock { id: 0, sequence: sequence.clone(), - block_sequence: alt_seq.to_string(), + block_sequence: sequence.get_sequence(None, None), sequence_start: 0, - sequence_end: alt_seq.len() as i32, + sequence_end: sequence.length, path_start: ref_start, path_end: ref_end, strand: "+".to_string(), @@ -217,6 +260,14 @@ fn prepare_change( } } +struct VcfEntry<'a> { + block_group_id: i32, + path: Path, + alt_seq: &'a str, + chromosome_index: i32, + phased: i32, +} + fn update_with_vcf( vcf_path: &String, collection_name: &str, @@ -245,6 +296,7 @@ fn update_with_vcf( // Cache a bunch of data ahead of making changes let mut block_group_cache = BlockGroupCache::new(conn); let mut path_cache = PathCache::new(conn); + let mut sequence_cache = SequenceCache::new(conn); let mut changes: Vec = vec![]; @@ -257,7 +309,8 @@ fn update_with_vcf( let ref_end = record.variant_end(&header).unwrap().get(); let alt_bases = record.alternate_bases(); let alt_alleles: Vec<_> = alt_bases.iter().collect::>().unwrap(); - // TODO: fix this duplication of handling an insert + let mut vcf_entries = vec![]; + if !fixed_sample.is_empty() && !genotype.is_empty() { let sample_bg_id = BlockGroupCache::lookup( &mut block_group_cache, @@ -275,17 +328,13 @@ fn update_with_vcf( Phasing::Phased => 1, Phasing::Unphased => 0, }; - let change = prepare_change( - conn, - sample_bg_id, - &sample_path, + vcf_entries.push(VcfEntry { + block_group_id: sample_bg_id, + path: sample_path.clone(), alt_seq, - ref_start as i32, - ref_end as i32, - chromosome_index as i32, + chromosome_index: chromosome_index as i32, phased, - ); - changes.push(change); + }); } } } @@ -313,17 +362,13 @@ fn update_with_vcf( let allele = allele.unwrap(); if allele != 0 { let alt_seq = alt_alleles[allele - 1]; - let change = prepare_change( - conn, - sample_bg_id, - &sample_path, + vcf_entries.push(VcfEntry { + block_group_id: sample_bg_id, + path: sample_path.clone(), alt_seq, - ref_start as i32, - ref_end as i32, - chromosome_index as i32, + chromosome_index: chromosome_index as i32, phased, - ); - changes.push(change); + }); } } } @@ -331,6 +376,22 @@ fn update_with_vcf( } } } + + for vcf_entry in vcf_entries { + let sequence = + SequenceCache::lookup(&mut sequence_cache, "DNA", vcf_entry.alt_seq.to_string()); + let change = prepare_change( + conn, + vcf_entry.block_group_id, + &vcf_entry.path, + ref_start as i32, + ref_end as i32, + vcf_entry.chromosome_index, + vcf_entry.phased, + sequence, + ); + changes.push(change); + } } BlockGroup::insert_changes(conn, &changes, &path_cache); diff --git a/src/models/sequence.rs b/src/models/sequence.rs index e0941e0..626bfa4 100644 --- a/src/models/sequence.rs +++ b/src/models/sequence.rs @@ -6,7 +6,7 @@ use sha2::{Digest, Sha256}; use std::collections::HashMap; use std::{fs, path::PathBuf, str}; -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct Sequence { pub hash: String, pub sequence_type: String, From 9ea03690397232027e29805f5b7ecd0efb77cf97 Mon Sep 17 00:00:00 2001 From: hofer Date: Thu, 29 Aug 2024 16:11:21 -0400 Subject: [PATCH 2/2] Follow suggestion --- src/main.rs | 8 +++++--- src/models/block_group.rs | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index bac9216..3c77e61 100644 --- a/src/main.rs +++ b/src/main.rs @@ -169,7 +169,7 @@ impl<'a> BlockGroupCache<'_> { block_group_cache.conn, collection_name, sample_name, - name.clone(), + &name.clone(), ); block_group_cache .cache @@ -216,8 +216,10 @@ impl<'a> SequenceCache<'_> { .sequence_type("DNA") .sequence(&sequence) .save(sequence_cache.conn); - let new_sequence = - Sequence::sequence_from_hash(sequence_cache.conn, &new_sequence_hash).unwrap(); + let new_sequence = NewSequence::new() + .sequence_type(sequence_type) + .sequence(&sequence) + .build(); sequence_cache .cache diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 5f0900a..bd1fa4c 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -197,7 +197,7 @@ impl BlockGroup { } } } - let new_bg_id = BlockGroup::create(conn, collection_name, Some(sample_name), &group_name); + let new_bg_id = BlockGroup::create(conn, collection_name, Some(sample_name), group_name); // clone parent blocks/edges/path BlockGroup::clone(conn, bg_id, new_bg_id.id);