Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sequence cache #26

Merged
merged 2 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 91 additions & 28 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ impl<'a> BlockGroupCache<'_> {
block_group_cache.conn,
collection_name,
sample_name,
name.clone(),
&name.clone(),
);
block_group_cache
.cache
Expand All @@ -179,29 +179,74 @@ impl<'a> BlockGroupCache<'_> {
}
}

/// Hashable key used to look up entries in a `SequenceCache`.
///
/// Combines a borrowed sequence-type label with an owned copy of the sequence text.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct SequenceKey<'a> {
/// Sequence type label (the call site visible in this file passes "DNA").
sequence_type: &'a str,
/// Owned sequence text, so the key does not hold a reference into the caller's data.
sequence: String,
}

/// In-memory cache of `Sequence` values keyed by `SequenceKey`, bundled with the
/// connection that `SequenceCache::lookup` uses on a cache miss.
#[derive(Debug)]
pub struct SequenceCache<'a> {
/// Sequences seen so far, keyed by sequence type and sequence text.
pub cache: HashMap<SequenceKey<'a>, Sequence>,
/// Borrowed connection handle; the cache does not own it.
pub conn: &'a Connection,
}

impl<'a> SequenceCache<'_> {
/// Creates a `SequenceCache` with an empty map that borrows `conn`.
pub fn new(conn: &Connection) -> SequenceCache {
SequenceCache {
// Starts empty; entries are inserted by `lookup` when a key is not found.
cache: HashMap::<SequenceKey, Sequence>::new(),
conn,
}
}

pub fn lookup(
sequence_cache: &mut SequenceCache<'a>,
sequence_type: &'a str,
sequence: String,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this will cause ownership issues: the String is moved into this function but never handed back to the caller. Taking a &String instead would resolve that.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was actually fighting the compiler about this yesterday. I couldn't get it to work with &str. I tried with &String just now and got the same problem, which is that the sequence cache exists outside the for loop that's processing the VCF records. The sequence that's being cached gets created inside that for loop, so any reference to it is transient (could go away from one iteration of the for loop to the next), so the sequence reference can't get added to the cache.

) -> Sequence {
let sequence_key = SequenceKey {
sequence_type,
sequence: sequence.clone(),
};
let sequence_lookup = sequence_cache.cache.get(&sequence_key);
if let Some(found_sequence) = sequence_lookup {
found_sequence.clone()
} else {
let new_sequence_hash = Sequence::new()
.sequence_type("DNA")
.sequence(&sequence)
.save(sequence_cache.conn);
let new_sequence = NewSequence::new()
.sequence_type(sequence_type)
.sequence(&sequence)
.build();

sequence_cache
.cache
.insert(sequence_key, new_sequence.clone());
new_sequence
}
}
}

#[allow(clippy::too_many_arguments)]
fn prepare_change(
conn: &Connection,
sample_bg_id: i32,
sample_path: &Path,
alt_seq: &str,
ref_start: i32,
ref_end: i32,
chromosome_index: i32,
phased: i32,
sequence: Sequence,
) -> PathChange {
// TODO: new sequence may not be real and be <DEL> or some sort. Handle these.
let new_sequence_hash = Sequence::new()
.sequence_type("DNA")
.sequence(alt_seq)
.save(conn);
let sequence = Sequence::sequence_from_hash(conn, &new_sequence_hash).unwrap();
let new_block = NewBlock {
id: 0,
sequence: sequence.clone(),
block_sequence: alt_seq.to_string(),
block_sequence: sequence.get_sequence(None, None),
sequence_start: 0,
sequence_end: alt_seq.len() as i32,
sequence_end: sequence.length,
path_start: ref_start,
path_end: ref_end,
strand: "+".to_string(),
Expand All @@ -217,6 +262,14 @@ fn prepare_change(
}
}

/// One alternate allele collected while scanning a VCF record, held until the
/// corresponding `PathChange` is built via `prepare_change`.
struct VcfEntry<'a> {
/// Block group id passed to `prepare_change` for this entry.
block_group_id: i32,
/// Path passed (by reference) to `prepare_change` for this entry.
path: Path,
/// Alternate-allele sequence text taken from the record's alternate alleles.
alt_seq: &'a str,
/// Chromosome index, cast to `i32` where the entry is created.
chromosome_index: i32,
/// Phasing flag: 1 for `Phasing::Phased`, 0 for `Phasing::Unphased`.
phased: i32,
}

fn update_with_vcf(
vcf_path: &String,
collection_name: &str,
Expand Down Expand Up @@ -245,6 +298,7 @@ fn update_with_vcf(
// Cache a bunch of data ahead of making changes
let mut block_group_cache = BlockGroupCache::new(conn);
let mut path_cache = PathCache::new(conn);
let mut sequence_cache = SequenceCache::new(conn);

let mut changes: Vec<PathChange> = vec![];

Expand All @@ -257,7 +311,8 @@ fn update_with_vcf(
let ref_end = record.variant_end(&header).unwrap().get();
let alt_bases = record.alternate_bases();
let alt_alleles: Vec<_> = alt_bases.iter().collect::<io::Result<_>>().unwrap();
// TODO: fix this duplication of handling an insert
let mut vcf_entries = vec![];

if !fixed_sample.is_empty() && !genotype.is_empty() {
let sample_bg_id = BlockGroupCache::lookup(
&mut block_group_cache,
Expand All @@ -275,17 +330,13 @@ fn update_with_vcf(
Phasing::Phased => 1,
Phasing::Unphased => 0,
};
let change = prepare_change(
conn,
sample_bg_id,
&sample_path,
vcf_entries.push(VcfEntry {
block_group_id: sample_bg_id,
path: sample_path.clone(),
alt_seq,
ref_start as i32,
ref_end as i32,
chromosome_index as i32,
chromosome_index: chromosome_index as i32,
phased,
);
changes.push(change);
});
}
}
}
Expand Down Expand Up @@ -313,24 +364,36 @@ fn update_with_vcf(
let allele = allele.unwrap();
if allele != 0 {
let alt_seq = alt_alleles[allele - 1];
let change = prepare_change(
conn,
sample_bg_id,
&sample_path,
vcf_entries.push(VcfEntry {
block_group_id: sample_bg_id,
path: sample_path.clone(),
alt_seq,
ref_start as i32,
ref_end as i32,
chromosome_index as i32,
chromosome_index: chromosome_index as i32,
phased,
);
changes.push(change);
});
}
}
}
}
}
}
}

for vcf_entry in vcf_entries {
let sequence =
SequenceCache::lookup(&mut sequence_cache, "DNA", vcf_entry.alt_seq.to_string());
let change = prepare_change(
conn,
vcf_entry.block_group_id,
&vcf_entry.path,
ref_start as i32,
ref_end as i32,
vcf_entry.chromosome_index,
vcf_entry.phased,
sequence,
);
changes.push(change);
}
}

BlockGroup::insert_changes(conn, &changes, &path_cache);
Expand Down
2 changes: 1 addition & 1 deletion src/models/block_group.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ impl BlockGroup {
}
}
}
let new_bg_id = BlockGroup::create(conn, collection_name, Some(sample_name), &group_name);
let new_bg_id = BlockGroup::create(conn, collection_name, Some(sample_name), group_name);

// clone parent blocks/edges/path
BlockGroup::clone(conn, bg_id, new_bg_id.id);
Expand Down
2 changes: 1 addition & 1 deletion src/models/sequence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use sha2::{Digest, Sha256};
use std::collections::HashMap;
use std::{fs, path::PathBuf, str};

#[derive(Clone, Debug)]
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct Sequence {
pub hash: String,
pub sequence_type: String,
Expand Down
Loading