Skip to content

Commit

Permalink
Merge pull request #62 from ginkgobioworks/blockgroup-deferred-sequnece
Browse files Browse the repository at this point in the history
Defer sequence fetching for groupblock
  • Loading branch information
Chris7 authored Oct 2, 2024
2 parents 1555734 + c2c65f3 commit c6f5d28
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 51 deletions.
4 changes: 2 additions & 2 deletions src/exports/gfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ fn write_segments(
continue;
}
writer
.write_all(&segment_line(&block.sequence, block.id as usize).into_bytes())
.write_all(&segment_line(&block.sequence(), block.id as usize).into_bytes())
.unwrap_or_else(|_| {
panic!(
"Error writing segment with sequence {} to GFA stream",
block.sequence,
block.sequence(),
)
});
}
Expand Down
4 changes: 2 additions & 2 deletions src/models/block_group.rs
Original file line number Diff line number Diff line change
Expand Up @@ -283,14 +283,14 @@ impl BlockGroup {
if start_node == *end_node {
let block = blocks_by_id.get(&start_node).unwrap();
if block.node_id != PATH_START_NODE_ID && block.node_id != PATH_END_NODE_ID {
sequences.insert(block.sequence.clone());
sequences.insert(block.sequence());
}
} else {
for path in all_simple_paths(&graph, start_node, *end_node) {
let mut current_sequence = "".to_string();
for node in path {
let block = blocks_by_id.get(&node).unwrap();
let block_sequence = block.sequence.clone();
let block_sequence = block.sequence();
current_sequence.push_str(&block_sequence);
}
sequences.insert(current_sequence);
Expand Down
108 changes: 62 additions & 46 deletions src/models/edge.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, RandomState};

use itertools::Itertools;
use petgraph::graphmap::DiGraphMap;
use rusqlite::types::Value;
use rusqlite::{params_from_iter, Connection, Result as SQLResult, Row};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, RandomState};

use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID};
use crate::models::sequence::{cached_sequence, Sequence};
use crate::models::strand::Strand;

#[derive(Clone, Debug, Eq, Hash, PartialEq, Deserialize, Serialize)]
Expand Down Expand Up @@ -60,11 +60,45 @@ pub struct BlockKey {
pub struct GroupBlock {
pub id: i64,
pub node_id: i64,
pub sequence: String,
sequence: Option<String>,
external_sequence: Option<(String, String)>,
pub start: i64,
pub end: i64,
}

impl GroupBlock {
    /// Builds a block covering `start..end` of `sequence`.
    ///
    /// When `sequence.external_sequence` is set, only the sequence's `file_path` and `name`
    /// are recorded and the bases are fetched later, on demand, by [`GroupBlock::sequence`].
    /// Otherwise the result of `sequence.get_sequence(start, end)` is stored in the block
    /// right away.
    pub fn new(id: i64, node_id: i64, sequence: &Sequence, start: i64, end: i64) -> Self {
        // Exactly one of the two sources is populated; `sequence()` relies on that.
        let (inline_sequence, external_sequence) = if sequence.external_sequence {
            (
                None,
                Some((sequence.file_path.clone(), sequence.name.clone())),
            )
        } else {
            (Some(sequence.get_sequence(start, end)), None)
        };
        GroupBlock {
            id,
            node_id,
            sequence: inline_sequence,
            external_sequence,
            start,
            end,
        }
    }

    /// Returns this block's sequence as an owned `String`.
    ///
    /// A sequence stored in the block is cloned. For a deferred block the bases are looked
    /// up through `cached_sequence` using the recorded file path, name, and the block's
    /// `start`/`end` coordinates.
    ///
    /// # Panics
    ///
    /// Panics if `cached_sequence` returns `None` for a deferred block, or if the block has
    /// neither a stored sequence nor an external reference.
    pub fn sequence(&self) -> String {
        match (&self.sequence, &self.external_sequence) {
            (Some(sequence), _) => sequence.clone(),
            (None, Some((path, name))) => {
                // NOTE(review): `as usize` assumes start/end are non-negative — confirm
                // against the code that computes block boundaries.
                cached_sequence(path, name, self.start as usize, self.end as usize)
                    .unwrap_or_else(|| {
                        // Name the file, record and range so a failed deferred fetch is
                        // diagnosable, rather than the generic `Option::unwrap` message.
                        panic!(
                            "Unable to fetch sequence {name} from {path} for range {}..{}",
                            self.start, self.end
                        )
                    })
            }
            (None, None) => panic!("Sequence or external sequence is not set."),
        }
    }
}

impl Edge {
#[allow(clippy::too_many_arguments)]
pub fn create(
Expand Down Expand Up @@ -343,45 +377,27 @@ impl Edge {
if !block_boundaries.is_empty() {
let start = 0;
let end = block_boundaries[0];
let first_block = GroupBlock {
id: block_index,
node_id: *node_id,
sequence: sequence.get_sequence(start, end),
start,
end,
};
let first_block = GroupBlock::new(block_index, *node_id, sequence, start, end);
blocks.push(first_block);
block_index += 1;
for (start, end) in block_boundaries.clone().into_iter().tuple_windows() {
let block = GroupBlock {
id: block_index,
node_id: *node_id,
sequence: sequence.get_sequence(start, end),
start,
end,
};
let block = GroupBlock::new(block_index, *node_id, sequence, start, end);
blocks.push(block);
block_index += 1;
}
let start = block_boundaries[block_boundaries.len() - 1];
let end = sequence.length;
let last_block = GroupBlock {
id: block_index,
node_id: *node_id,
sequence: sequence.get_sequence(start, end),
start,
end,
};
let last_block = GroupBlock::new(block_index, *node_id, sequence, start, end);
blocks.push(last_block);
block_index += 1;
} else {
blocks.push(GroupBlock {
id: block_index,
node_id: *node_id,
sequence: sequence.get_sequence(None, None),
start: 0,
end: sequence.length,
});
blocks.push(GroupBlock::new(
block_index,
*node_id,
sequence,
0,
sequence.length,
));
block_index += 1;
}
}
Expand All @@ -390,21 +406,21 @@ impl Edge {
// block group, since different paths in the block group may start or end at different
// places on sequences. These two "start sequence" and "end sequence" blocks will serve
// that role.
let start_block = GroupBlock {
id: block_index + 1,
node_id: PATH_START_NODE_ID,
sequence: "".to_string(),
start: 0,
end: 0,
};
let start_block = GroupBlock::new(
block_index + 1,
PATH_START_NODE_ID,
&Sequence::new().sequence_type("DNA").sequence("").build(),
0,
0,
);
blocks.push(start_block);
let end_block = GroupBlock {
id: block_index + 2,
node_id: PATH_END_NODE_ID,
sequence: "".to_string(),
start: 0,
end: 0,
};
let end_block = GroupBlock::new(
block_index + 2,
PATH_END_NODE_ID,
&Sequence::new().sequence_type("DNA").sequence("").build(),
0,
0,
);
blocks.push(end_block);
(blocks, boundary_edges)
}
Expand Down
2 changes: 1 addition & 1 deletion src/models/sequence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ fn fasta_gzi_index(path: &str) -> Option<gzi::Index> {
None
}

fn cached_sequence(file_path: &str, name: &str, start: usize, end: usize) -> Option<String> {
pub fn cached_sequence(file_path: &str, name: &str, start: usize, end: usize) -> Option<String> {
static SEQUENCE_CACHE: sync::LazyLock<sync::RwLock<HashMap<String, Option<String>>>> =
sync::LazyLock::new(|| sync::RwLock::new(HashMap::new()));
let key = format!("{file_path}-{name}");
Expand Down

0 comments on commit c6f5d28

Please sign in to comment.