Skip to content

Commit

Permalink
New methods to generate sequence from path of new edges
Browse files Browse the repository at this point in the history
  • Loading branch information
dkhofer committed Aug 19, 2024
1 parent b1a09d9 commit e320d77
Show file tree
Hide file tree
Showing 9 changed files with 375 additions and 75 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = "2021"
bio = "2.0.0"
clap = { version = "4.5.8", features = ["derive"] }
include_dir = "0.7.4"
itertools = "0.13.0"
rusqlite = { version = "0.31.0", features = ["bundled", "array"] }
rusqlite_migration = { version = "1.2.0" , features = ["from-directory"]}
sha2 = "0.10.8"
Expand Down
9 changes: 4 additions & 5 deletions migrations/01-initial/up.sql
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,9 @@ CREATE UNIQUE INDEX new_edge_uidx ON new_edges(source_hash, source_coordinate, t
CREATE TABLE path_edges (
id INTEGER PRIMARY KEY NOT NULL,
path_id INTEGER NOT NULL,
source_edge_id INTEGER,
target_edge_id INTEGER,
FOREIGN KEY(source_edge_id) REFERENCES new_edges(id),
FOREIGN KEY(target_edge_id) REFERENCES new_edges(id),
index_in_path INTEGER NOT NULL,
edge_id INTEGER NOT NULL,
FOREIGN KEY(edge_id) REFERENCES new_edges(id),
FOREIGN KEY(path_id) REFERENCES path(id)
);
CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, source_edge_id, target_edge_id);
CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id);
2 changes: 1 addition & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ mod tests {
HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()])
);
assert_eq!(
Path::sequence(&conn, 1),
Path::get_sequence(&conn, 1),
"ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()
);
}
Expand Down
28 changes: 10 additions & 18 deletions src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -277,16 +277,8 @@ impl BlockGroup {
let sequence_hashes = block_map
.values()
.map(|block| format!("\"{id}\"", id = block.sequence_hash))
.collect::<Vec<_>>()
.join(",");
let mut sequence_map = HashMap::new();
for sequence in Sequence::get_sequences(
conn,
&format!("select * from sequence where hash in ({sequence_hashes})"),
vec![],
) {
sequence_map.insert(sequence.hash, sequence.sequence);
}
.collect::<Vec<_>>();
let sequence_map = Sequence::get_sequences_by_hash(conn, sequence_hashes);
let block_ids = block_map
.keys()
.map(|id| format!("{id}"))
Expand Down Expand Up @@ -323,7 +315,8 @@ impl BlockGroup {
let block = block_map.get(&start_node).unwrap();
let block_sequence = sequence_map.get(&block.sequence_hash).unwrap();
sequences.insert(
block_sequence[(block.start as usize)..(block.end as usize)].to_string(),
block_sequence.sequence[(block.start as usize)..(block.end as usize)]
.to_string(),
);
} else {
for path in all_simple_paths(&graph, start_node, *end_node) {
Expand All @@ -332,7 +325,8 @@ impl BlockGroup {
let block = block_map.get(&node).unwrap();
let block_sequence = sequence_map.get(&block.sequence_hash).unwrap();
current_sequence.push_str(
&block_sequence[(block.start as usize)..(block.end as usize)],
&block_sequence.sequence
[(block.start as usize)..(block.end as usize)],
);
}
sequences.insert(current_sequence);
Expand Down Expand Up @@ -428,20 +422,19 @@ impl BlockGroup {
// |----range---|
let start_split_point = block.start + start - path_start;
let end_split_point = block.start + end - path_start;
let mut next_block;
if start_split_point == block.start {
let next_block = if start_split_point == block.start {
if let Some(pb) = previous_block {
new_edges.push((Some(pb.id), Some(new_block_id)));
}
next_block = block.clone();
block.clone()
} else {
let (left_block, right_block) =
Block::split(conn, block, start_split_point, chromosome_index, phased)
.unwrap();
Block::delete(conn, block.id);
new_edges.push((Some(left_block.id), Some(new_block_id)));
next_block = right_block.clone();
}
right_block.clone()
};

if end_split_point == next_block.start {
new_edges.push((Some(new_block_id), Some(next_block.id)));
Expand Down Expand Up @@ -585,7 +578,6 @@ impl ChangeLog {
mod tests {
use super::*;
use crate::migrations::run_migrations;
use std::hash::Hash;

fn get_connection() -> Connection {
let mut conn = Connection::open_in_memory()
Expand Down
30 changes: 29 additions & 1 deletion src/models/new_edge.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use rusqlite::types::Value;
use rusqlite::{params_from_iter, Connection};

#[derive(Debug)]
#[derive(Clone, Debug)]
pub struct NewEdge {
pub id: i32,
pub source_hash: Option<String>,
Expand Down Expand Up @@ -85,4 +85,32 @@ impl NewEdge {
}
}
}

pub fn load(conn: &Connection, edge_ids: Vec<i32>) -> Vec<NewEdge> {
let formatted_edge_ids = edge_ids
.into_iter()
.map(|edge_id| edge_id.to_string())
.collect::<Vec<_>>()
.join(",");
let query = format!("select id, source_hash, source_coordinate, target_hash, target_coordinate, chromosome_index, phased from new_edges where id in ({});", formatted_edge_ids);
let mut stmt = conn.prepare_cached(&query).unwrap();
let rows = stmt
.query_map([], |row| {
Ok(NewEdge {
id: row.get(0)?,
source_hash: row.get(1)?,
source_coordinate: row.get(2)?,
target_hash: row.get(3)?,
target_coordinate: row.get(4)?,
chromosome_index: row.get(5)?,
phased: row.get(6)?,
})
})
.unwrap();
let mut objs = vec![];
for row in rows {
objs.push(row.unwrap());
}
objs
}
}
144 changes: 132 additions & 12 deletions src/models/path.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
use crate::models::{block::Block, edge::Edge, path_edge::PathEdge};
use crate::models::{block::Block, new_edge::NewEdge, path_edge::PathEdge, sequence::Sequence};
use petgraph::graphmap::DiGraphMap;
use petgraph::prelude::Dfs;
use petgraph::Direction;
use rusqlite::types::Value;
use rusqlite::{params_from_iter, Connection};
use std::collections::{HashMap, HashSet};

use itertools::Itertools;

#[derive(Debug)]
pub struct Path {
Expand Down Expand Up @@ -49,10 +52,12 @@ pub fn revcomp(seq: &str) -> String {
#[derive(Clone, Debug)]
pub struct NewBlock {
pub id: i32,
pub sequence_hash: String,
pub sequence: Sequence,
pub block_sequence: String,
pub start: i32,
pub end: i32,
pub sequence_start: i32,
pub sequence_end: i32,
pub path_start: i32,
pub path_end: i32,
pub strand: String,
}

Expand Down Expand Up @@ -84,6 +89,33 @@ impl Path {
path
}

pub fn new_create(
conn: &Connection,
name: &str,
block_group_id: i32,
edge_ids: Vec<i32>,
) -> Path {
let query = "INSERT INTO path (name, block_group_id) VALUES (?1, ?2) RETURNING (id)";
let mut stmt = conn.prepare(query).unwrap();
let mut rows = stmt
.query_map((name, block_group_id), |row| {
Ok(Path {
id: row.get(0)?,
name: name.to_string(),
block_group_id,
blocks: vec![],
})
})
.unwrap();
let path = rows.next().unwrap().unwrap();

for (index, edge_id) in edge_ids.iter().enumerate() {
PathEdge::create(conn, path.id, index.try_into().unwrap(), *edge_id);
}

path
}

pub fn get(conn: &mut Connection, path_id: i32) -> Path {
let query = "SELECT id, block_group_id, name from path where id = ?1;";
let mut stmt = conn.prepare(query).unwrap();
Expand Down Expand Up @@ -120,7 +152,7 @@ impl Path {
paths
}

pub fn sequence(conn: &Connection, path_id: i32) -> String {
pub fn get_sequence(conn: &Connection, path_id: i32) -> String {
let block_ids = PathBlock::get_blocks(conn, path_id);
let mut sequence = "".to_string();
for block_id in block_ids {
Expand All @@ -134,10 +166,98 @@ impl Path {
sequence
}

pub fn get_new_blocks(conn: &Connection, path_id: i32) -> Vec<NewBlock> {
let mut new_blocks = vec![];
let edges = PathEdge::edges_for(conn, path_id);
new_blocks
pub fn new_get_sequence(conn: &Connection, path: Path) -> String {
let blocks = Path::blocks_for(conn, path);
blocks
.into_iter()
.map(|block| block.block_sequence)
.collect::<Vec<_>>()
.join("")
}

pub fn edge_pairs_to_block(
block_id: i32,
path: &Path,
into: NewEdge,
out_of: NewEdge,
sequences_by_hash: &HashMap<String, Sequence>,
current_path_length: i32,
) -> NewBlock {
if into.target_hash.is_none() || out_of.source_hash.is_none() {
panic!(
"Consecutive edges in path {} have None as internal block sequence",
path.id
);
}

if into.target_hash != out_of.source_hash {
panic!(
"Consecutive edges in path {0} don't share the same block",
path.id
);
}

let sequence = sequences_by_hash.get(&into.target_hash.unwrap()).unwrap();
let start = into.target_coordinate.unwrap();
let end = out_of.source_coordinate.unwrap();

let strand;
let block_sequence;

if end >= start {
strand = "+";
block_sequence = sequence.sequence[start as usize..end as usize].to_string();
} else {
strand = "-";
block_sequence = revcomp(&sequence.sequence[end as usize..start as usize + 1]);
}

NewBlock {
id: block_id,
sequence: sequence.clone(),
block_sequence,
sequence_start: start,
sequence_end: end,
path_start: current_path_length,
path_end: current_path_length + end,
strand: strand.to_string(),
}
}

pub fn blocks_for(conn: &Connection, path: Path) -> Vec<NewBlock> {
let edges = PathEdge::edges_for(conn, path.id);
let mut sequence_hashes = HashSet::new();
for edge in &edges {
if edge.source_hash.is_some() {
sequence_hashes.insert(edge.source_hash.clone().unwrap());
}
if edge.target_hash.is_some() {
sequence_hashes.insert(edge.target_hash.clone().unwrap());
}
}
let sequences_by_hash = Sequence::get_sequences_by_hash(
conn,
sequence_hashes
.into_iter()
.map(|hash| format!("\"{hash}\""))
.collect(),
);

let mut blocks = vec![];
let mut path_length = 0;
for (index, (into, out_of)) in edges.into_iter().tuple_windows().enumerate() {
let block = Path::edge_pairs_to_block(
index as i32,
&path,
into,
out_of,
&sequences_by_hash,
path_length,
);
path_length += block.block_sequence.len() as i32;
blocks.push(block);
}
blocks
}
}

Expand Down Expand Up @@ -290,7 +410,7 @@ mod tests {
use super::*;

use crate::migrations::run_migrations;
use crate::models::{sequence::Sequence, BlockGroup, Collection};
use crate::models::{sequence::Sequence, BlockGroup, Collection, Edge};

fn get_connection() -> Connection {
let mut conn = Connection::open_in_memory()
Expand Down Expand Up @@ -323,7 +443,7 @@ mod tests {
block_group.id,
vec![block1.id, block2.id, block3.id],
);
assert_eq!(Path::sequence(conn, path.id), "ATCGATCGAAAAAAACCCCCCC");
assert_eq!(Path::get_sequence(conn, path.id), "ATCGATCGAAAAAAACCCCCCC");
}

#[test]
Expand All @@ -349,7 +469,7 @@ mod tests {
block_group.id,
vec![block3.id, block2.id, block1.id],
);
assert_eq!(Path::sequence(conn, path.id), "GGGGGGGTTTTTTTCGATCGAT");
assert_eq!(Path::get_sequence(conn, path.id), "GGGGGGGTTTTTTTCGATCGAT");
}

#[test]
Expand Down
Loading

0 comments on commit e320d77

Please sign in to comment.