Skip to content

Commit

Permalink
Add sequence cache
Browse files Browse the repository at this point in the history
  • Loading branch information
dkhofer committed Aug 28, 2024
1 parent 0ebe206 commit 5f240fb
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 28 deletions.
115 changes: 88 additions & 27 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,29 +179,72 @@ impl<'a> BlockGroupCache<'_> {
}
}

/// Hashable key used by `SequenceCache` to identify a sequence by its type
/// label together with its full sequence text.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct SequenceKey<'a> {
/// Borrowed sequence type label (the caller visible in this file passes "DNA").
sequence_type: &'a str,
/// Owned copy of the sequence text being looked up.
sequence: String,
}

/// In-memory cache of `Sequence` values keyed by `SequenceKey`, paired with
/// the connection that `SequenceCache::lookup` uses on a cache miss.
#[derive(Debug)]
pub struct SequenceCache<'a> {
/// Sequences already resolved, keyed by (sequence type, sequence text).
pub cache: HashMap<SequenceKey<'a>, Sequence>,
/// Borrowed connection used to save and load sequences that are not cached yet.
pub conn: &'a Connection,
}

impl<'a> SequenceCache<'_> {
    /// Creates an empty cache that resolves misses through `conn`.
    pub fn new(conn: &Connection) -> SequenceCache {
        SequenceCache {
            cache: HashMap::<SequenceKey, Sequence>::new(),
            conn,
        }
    }

    /// Returns the `Sequence` for `(sequence_type, sequence)`.
    ///
    /// On a cache hit a clone of the cached value is returned. On a miss the
    /// sequence is saved through the cache's connection, read back by the hash
    /// that `save` returned, stored in the cache, and returned.
    ///
    /// The saved sequence uses the caller's `sequence_type`, so the cache key
    /// and the stored record always agree on the type.
    ///
    /// # Panics
    ///
    /// Panics if the freshly saved sequence cannot be read back by its hash.
    pub fn lookup(
        sequence_cache: &mut SequenceCache<'a>,
        sequence_type: &'a str,
        sequence: String,
    ) -> Sequence {
        // The key takes ownership of the text; it is borrowed back below, so
        // no extra copy of a potentially long sequence is needed.
        let sequence_key = SequenceKey {
            sequence_type,
            sequence,
        };
        if let Some(found_sequence) = sequence_cache.cache.get(&sequence_key) {
            return found_sequence.clone();
        }

        let new_sequence_hash = Sequence::new()
            .sequence_type(sequence_type)
            .sequence(&sequence_key.sequence)
            .save(sequence_cache.conn);
        let new_sequence =
            Sequence::sequence_from_hash(sequence_cache.conn, &new_sequence_hash).unwrap();

        sequence_cache
            .cache
            .insert(sequence_key, new_sequence.clone());
        new_sequence
    }
}

#[allow(clippy::too_many_arguments)]
fn prepare_change(
conn: &Connection,
sample_bg_id: i32,
sample_path: &Path,
alt_seq: &str,
ref_start: i32,
ref_end: i32,
chromosome_index: i32,
phased: i32,
sequence: Sequence,
) -> PathChange {
// TODO: the new sequence may not be a literal sequence but a symbolic allele such as <DEL>. Handle these.
let new_sequence_hash = Sequence::new()
.sequence_type("DNA")
.sequence(alt_seq)
.save(conn);
let sequence = Sequence::sequence_from_hash(conn, &new_sequence_hash).unwrap();
let new_block = NewBlock {
id: 0,
sequence: sequence.clone(),
block_sequence: alt_seq.to_string(),
block_sequence: sequence.get_sequence(None, None),
sequence_start: 0,
sequence_end: alt_seq.len() as i32,
sequence_end: sequence.length,
path_start: ref_start,
path_end: ref_end,
strand: "+".to_string(),
Expand All @@ -217,6 +260,14 @@ fn prepare_change(
}
}

/// One alternate allele collected from a VCF record, held until its sequence
/// has been resolved and it can be turned into a `PathChange`.
struct VcfEntry<'a> {
/// Id of the block group the change applies to.
block_group_id: i32,
/// Path the change is made against (stored as an owned clone).
path: Path,
/// Alternate allele text, borrowed from the record being processed.
alt_seq: &'a str,
/// Chromosome index recorded for this allele; presumably its position within the genotype (confirm against the caller).
chromosome_index: i32,
/// Phasing flag; the visible caller sets 1 for phased and 0 for unphased.
phased: i32,
}

fn update_with_vcf(
vcf_path: &String,
collection_name: &str,
Expand Down Expand Up @@ -245,6 +296,7 @@ fn update_with_vcf(
// Cache a bunch of data ahead of making changes
let mut block_group_cache = BlockGroupCache::new(conn);
let mut path_cache = PathCache::new(conn);
let mut sequence_cache = SequenceCache::new(conn);

let mut changes: Vec<PathChange> = vec![];

Expand All @@ -257,7 +309,8 @@ fn update_with_vcf(
let ref_end = record.variant_end(&header).unwrap().get();
let alt_bases = record.alternate_bases();
let alt_alleles: Vec<_> = alt_bases.iter().collect::<io::Result<_>>().unwrap();
// TODO: fix this duplication of handling an insert
let mut vcf_entries = vec![];

if !fixed_sample.is_empty() && !genotype.is_empty() {
let sample_bg_id = BlockGroupCache::lookup(
&mut block_group_cache,
Expand All @@ -275,17 +328,13 @@ fn update_with_vcf(
Phasing::Phased => 1,
Phasing::Unphased => 0,
};
let change = prepare_change(
conn,
sample_bg_id,
&sample_path,
vcf_entries.push(VcfEntry {
block_group_id: sample_bg_id,
path: sample_path.clone(),
alt_seq,
ref_start as i32,
ref_end as i32,
chromosome_index as i32,
chromosome_index: chromosome_index as i32,
phased,
);
changes.push(change);
});
}
}
}
Expand Down Expand Up @@ -313,24 +362,36 @@ fn update_with_vcf(
let allele = allele.unwrap();
if allele != 0 {
let alt_seq = alt_alleles[allele - 1];
let change = prepare_change(
conn,
sample_bg_id,
&sample_path,
vcf_entries.push(VcfEntry {
block_group_id: sample_bg_id,
path: sample_path.clone(),
alt_seq,
ref_start as i32,
ref_end as i32,
chromosome_index as i32,
chromosome_index: chromosome_index as i32,
phased,
);
changes.push(change);
});
}
}
}
}
}
}
}

for vcf_entry in vcf_entries {
let sequence =
SequenceCache::lookup(&mut sequence_cache, "DNA", vcf_entry.alt_seq.to_string());
let change = prepare_change(
conn,
vcf_entry.block_group_id,
&vcf_entry.path,
ref_start as i32,
ref_end as i32,
vcf_entry.chromosome_index,
vcf_entry.phased,
sequence,
);
changes.push(change);
}
}

BlockGroup::insert_changes(conn, &changes, &path_cache);
Expand Down
2 changes: 1 addition & 1 deletion src/models/sequence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use sha2::{Digest, Sha256};
use std::collections::HashMap;
use std::{fs, path::PathBuf, str};

#[derive(Clone, Debug)]
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct Sequence {
pub hash: String,
pub sequence_type: String,
Expand Down

0 comments on commit 5f240fb

Please sign in to comment.