Skip to content

Commit

Permalink
Merge pull request #8 from ginkgobioworks/implement-path
Browse files Browse the repository at this point in the history
Path implementation
  • Loading branch information
Chris7 authored Aug 2, 2024
2 parents 64716f0 + 180b866 commit a592b5a
Show file tree
Hide file tree
Showing 9 changed files with 778 additions and 2,465 deletions.
1,722 changes: 0 additions & 1,722 deletions Cargo.lock

This file was deleted.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ rusqlite = { version = "0.31.0", features = ["bundled", "array"] }
rusqlite_migration = { version = "1.2.0" , features = ["from-directory"]}
sha2 = "0.10.8"
noodles = { version = "0.78.0", features = ["vcf", "fasta", "async"] }
petgraph = "0.6.5"
28 changes: 20 additions & 8 deletions migrations/01-initial/up.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,6 @@ CREATE TABLE sequence (
"length" INTEGER NOT NULL
);

CREATE TABLE path (
id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL,
path_index INTEGER NOT NULL DEFAULT 0
);

CREATE TABLE block_group (
id INTEGER PRIMARY KEY NOT NULL,
collection_name TEXT NOT NULL,
Expand All @@ -44,13 +38,31 @@ CREATE UNIQUE INDEX block_uidx ON block(sequence_hash, block_group_id, start, en

CREATE TABLE edges (
id INTEGER PRIMARY KEY NOT NULL,
source_id INTEGER NOT NULL,
source_id INTEGER,
target_id INTEGER,
chromosome_index INTEGER NOT NULL,
phased INTEGER NOT NULL,
FOREIGN KEY(source_id) REFERENCES block(id),
FOREIGN KEY(target_id) REFERENCES block(id),
constraint chk_phased check (phased in (0, 1))
);

CREATE UNIQUE INDEX edge_uidx ON edges(source_id, target_id, chromosome_index, phased);

-- A named path that belongs to exactly one block group.
-- Each row carries its own integer id, the owning block_group id, and a name.
-- NOTE(review): SQLite only enforces FOREIGN KEY clauses when
-- `PRAGMA foreign_keys = ON` is set on the connection -- confirm the
-- connection setup enables it.
CREATE TABLE path (
id INTEGER PRIMARY KEY NOT NULL,
block_group_id INTEGER NOT NULL,
name TEXT NOT NULL,
FOREIGN KEY(block_group_id) REFERENCES block_group(id)
);
-- A path name may appear at most once within a given block group; the same
-- name can still be used by paths in different block groups.
CREATE UNIQUE INDEX path_uidx ON path(block_group_id, name);

-- Associates a path with rows of the edges table. Each row names one path and
-- a (source_edge_id, target_edge_id) pair of edge ids; either edge id may be
-- NULL.
-- NOTE(review): presumably each row links two consecutive edges along the
-- path, with NULL marking the path's first/last link -- confirm against the
-- Path model code.
CREATE TABLE path_edges (
id INTEGER PRIMARY KEY NOT NULL,
path_id INTEGER NOT NULL,
source_edge_id INTEGER,
target_edge_id INTEGER,
FOREIGN KEY(source_edge_id) REFERENCES edges(id),
FOREIGN KEY(target_edge_id) REFERENCES edges(id),
FOREIGN KEY(path_id) REFERENCES path(id)
);
-- Intended to keep (path, source edge, target edge) triples unique.
-- NOTE(review): SQLite treats NULLs as distinct from one another in unique
-- indexes, so rows whose source_edge_id or target_edge_id is NULL are not
-- deduplicated by this index -- verify that is acceptable.
CREATE UNIQUE INDEX path_edge_uidx ON path_edges(path_id, source_edge_id, target_edge_id);
38 changes: 29 additions & 9 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ use std::path::PathBuf;
use bio::io::fasta;
use gen::get_connection;
use gen::migrations::run_migrations;
use gen::models::{self, block::Block, edge::Edge, sequence::Sequence, BlockGroup};
use gen::models::{self, block::Block, edge::Edge, path::Path, sequence::Sequence, BlockGroup};
use noodles::vcf;
use noodles::vcf::variant::record::samples::series::value::genotype::Phasing;
use noodles::vcf::variant::record::samples::series::Value;
use noodles::vcf::variant::record::samples::{Sample, Series};
use noodles::vcf::variant::record::{AlternateBases, ReferenceBases, Samples};
use noodles::vcf::variant::Record;
use rusqlite::Connection;
use rusqlite::{types::Value as SQLValue, Connection};
use std::io;

#[derive(Parser)]
Expand Down Expand Up @@ -80,7 +80,14 @@ fn import_fasta(fasta: &String, name: &String, shallow: bool, conn: &mut Connect
(sequence.len() as i32),
&"1".to_string(),
);
let edge = Edge::create(conn, block.id, None, 0, 0);
let edge_1 = Edge::create(conn, None, Some(block.id), 0, 0);
let edge_2 = Edge::create(conn, Some(block.id), None, 0, 0);
Path::create(
conn,
record.id(),
block_group.id,
vec![edge_1.id, edge_2.id],
);
}
println!("Created it");
} else {
Expand Down Expand Up @@ -108,10 +115,8 @@ fn update_with_vcf(vcf_path: &String, collection_name: &String, conn: &mut Conne
let ref_end = record.variant_end(&header).unwrap().get();
let alt_bases = record.alternate_bases();
let alt_alleles: Vec<_> = alt_bases.iter().collect::<io::Result<_>>().unwrap();
let mut created: HashSet<i32> = HashSet::new();
for (sample_index, sample) in record.samples().iter().enumerate() {
let genotype = sample.get(&header, "GT");
let mut allele_blocks: HashMap<i32, i32> = HashMap::new();
if genotype.is_some() {
if let Value::Genotype(genotypes) = genotype.unwrap().unwrap().unwrap() {
for (chromosome_index, gt) in genotypes.iter().enumerate() {
Expand All @@ -137,6 +142,14 @@ fn update_with_vcf(vcf_path: &String, collection_name: &String, conn: &mut Conne
&sample_names[sample_index],
&seq_name,
);
let sample_path_id = Path::get_paths(
conn,
"select * from path where block_group_id = ?1 AND name = ?2",
vec![
SQLValue::from(sample_bg_id),
SQLValue::from(seq_name.clone()),
],
);
let new_block_id = Block::create(
conn,
&new_sequence_hash,
Expand All @@ -145,10 +158,9 @@ fn update_with_vcf(vcf_path: &String, collection_name: &String, conn: &mut Conne
alt_seq.len() as i32,
&"1".to_string(),
);
println!("{sample_bg_id} {new_block_id:?} {chromosome_index} {phased} {allele}");
BlockGroup::insert_change(
conn,
sample_bg_id,
sample_path_id[0].id,
ref_start as i32,
ref_end as i32,
new_block_id.id,
Expand Down Expand Up @@ -227,8 +239,16 @@ mod tests {
);
update_with_vcf(&vcf_path.to_str().unwrap().to_string(), &collection, conn);
assert_eq!(
BlockGroup::sequence(conn, &collection, Some(&"foo".to_string()), "m123"),
"ATCATCGATCGATCGATCGGGAACACACAGAGA"
BlockGroup::get_all_sequences(conn, 1),
HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()])
);
assert_eq!(
BlockGroup::get_all_sequences(conn, 2),
HashSet::from_iter(vec!["ATCATCGATAGAGATCGATCGGGAACACACAGAGA".to_string()])
);
assert_eq!(
BlockGroup::get_all_sequences(conn, 3),
HashSet::from_iter(vec!["ATCATCGATCGATCGATCGGGAACACACAGAGA".to_string()])
);
}
}
Loading

0 comments on commit a592b5a

Please sign in to comment.