Skip to content

Commit

Permalink
Merge pull request #7 from ginkgobioworks/implement-blockgroup
Browse files Browse the repository at this point in the history
Implement block group
  • Loading branch information
dkhofer authored Jul 30, 2024
2 parents e31527d + c84cf6a commit 229ebd4
Show file tree
Hide file tree
Showing 3 changed files with 585 additions and 107 deletions.
22 changes: 15 additions & 7 deletions migrations/01-initial/up.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,35 +14,43 @@ CREATE TABLE sequence (
);

CREATE TABLE path (
id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL,
path_index INTEGER NOT NULL DEFAULT 0
);

CREATE TABLE block_group (
id INTEGER PRIMARY KEY NOT NULL,
collection_name TEXT NOT NULL,
sample_name TEXT,
name TEXT NOT NULL,
path_index INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY(collection_name) REFERENCES collection(name),
FOREIGN KEY(sample_name) REFERENCES sample(name)
);
CREATE UNIQUE INDEX path_uidx ON path(collection_name, sample_name, name, path_index);
CREATE UNIQUE INDEX block_group_uidx ON block_group(collection_name, sample_name, name);

CREATE TABLE block (
id INTEGER PRIMARY KEY NOT NULL,
sequence_hash TEXT NOT NULL,
path_id INTEGER NOT NULL,
block_group_id INTEGER NOT NULL,
"start" INTEGER NOT NULL,
"end" INTEGER NOT NULL,
strand TEXT NOT NULL DEFAULT "1",
FOREIGN KEY(sequence_hash) REFERENCES sequence(hash),
FOREIGN KEY(path_id) REFERENCES path(id),
FOREIGN KEY(block_group_id) REFERENCES block_group(id),
constraint chk_strand check (strand in ('-1', '1', '0', '.', '?'))
);
CREATE UNIQUE INDEX block_uidx ON block(sequence_hash, path_id, start, end, strand);
CREATE UNIQUE INDEX block_uidx ON block(sequence_hash, block_group_id, start, end, strand);

CREATE TABLE edges (
id INTEGER PRIMARY KEY NOT NULL,
source_id INTEGER NOT NULL,
target_id INTEGER,
chromosome_index INTEGER NOT NULL,
phased INTEGER NOT NULL,
FOREIGN KEY(source_id) REFERENCES block(id),
FOREIGN KEY(target_id) REFERENCES block(id)
FOREIGN KEY(target_id) REFERENCES block(id),
constraint chk_phased check (phased in (0, 1))
);

CREATE UNIQUE INDEX edge_uidx ON edges(source_id, target_id);
CREATE UNIQUE INDEX edge_uidx ON edges(source_id, target_id, chromosome_index, phased);
47 changes: 27 additions & 20 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#![allow(warnings)]
use clap::{Parser, Subcommand};
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::fmt::Debug;
use std::path::PathBuf;

use bio::io::fasta;
use gen::get_connection;
use gen::migrations::run_migrations;
use gen::models::{self, Block, Path};
use gen::models::{self, Block, BlockGroup, Sequence};
use noodles::vcf;
use noodles::vcf::variant::record::samples::series::value::genotype::Phasing;
use noodles::vcf::variant::record::samples::series::Value;
use noodles::vcf::variant::record::samples::{Sample, Series};
use noodles::vcf::variant::record::{AlternateBases, ReferenceBases, Samples};
Expand Down Expand Up @@ -69,17 +70,17 @@ fn import_fasta(fasta: &String, name: &String, shallow: bool, conn: &mut Connect
let record = result.expect("Error during fasta record parsing");
let sequence = String::from_utf8(record.seq().to_vec()).unwrap();
let seq_hash = models::Sequence::create(conn, "DNA".to_string(), &sequence, !shallow);
let path =
models::Path::create(conn, &collection.name, None, &record.id().to_string(), None);
let block_group =
BlockGroup::create(conn, &collection.name, None, &record.id().to_string());
let block = Block::create(
conn,
&seq_hash,
path.id,
block_group.id,
0,
(sequence.len() as i32),
&"1".to_string(),
);
let edge = models::Edge::create(conn, block.id, None);
let edge = models::Edge::create(conn, block.id, None, 0, 0);
}
println!("Created it");
} else {
Expand Down Expand Up @@ -107,48 +108,54 @@ fn update_with_vcf(vcf_path: &String, collection_name: &String, conn: &mut Conne
let ref_end = record.variant_end(&header).unwrap().get();
let alt_bases = record.alternate_bases();
let alt_alleles: Vec<_> = alt_bases.iter().collect::<io::Result<_>>().unwrap();
let mut created: HashSet<i32> = HashSet::new();
for (sample_index, sample) in record.samples().iter().enumerate() {
let genotype = sample.get(&header, "GT");
let mut seen_haplotypes: HashSet<i32> = HashSet::new();
let mut allele_blocks: HashMap<i32, i32> = HashMap::new();
if genotype.is_some() {
if let Value::Genotype(genotypes) = genotype.unwrap().unwrap().unwrap() {
for gt in genotypes.iter() {
for (chromosome_index, gt) in genotypes.iter().enumerate() {
if gt.is_ok() {
let (haplotype, phasing) = gt.unwrap();
let haplotype = haplotype.unwrap();
if haplotype != 0 && !seen_haplotypes.contains(&(haplotype as i32)) {
let alt_seq = alt_alleles[haplotype - 1];
let (allele, phasing) = gt.unwrap();
let phased = match phasing {
Phasing::Phased => 1,
Phasing::Unphased => 0,
};
let allele = allele.unwrap();
if allele != 0 {
let alt_seq = alt_alleles[allele - 1];
// TODO: new sequence may not be real and be <DEL> or some sort. Handle these.
let new_sequence_hash = models::Sequence::create(
let new_sequence_hash = Sequence::create(
conn,
"DNA".to_string(),
&alt_seq.to_string(),
true,
);
let sample_path_id = models::Path::get_or_create_sample_path(
let sample_bg_id = BlockGroup::get_or_create_sample_block_group(
conn,
collection_name,
&sample_names[sample_index],
&seq_name,
haplotype as i32,
);
let new_block_id = Block::create(
conn,
&new_sequence_hash,
sample_path_id,
sample_bg_id,
0,
alt_seq.len() as i32,
&"1".to_string(),
);
Path::insert_change(
println!("{sample_bg_id} {new_block_id:?} {chromosome_index} {phased} {allele}");
BlockGroup::insert_change(
conn,
sample_path_id,
sample_bg_id,
ref_start as i32,
ref_end as i32,
new_block_id.id,
chromosome_index as i32,
phased,
);
}
seen_haplotypes.insert(haplotype as i32);
}
}
}
Expand Down Expand Up @@ -220,7 +227,7 @@ mod tests {
);
update_with_vcf(&vcf_path.to_str().unwrap().to_string(), &collection, conn);
assert_eq!(
Path::sequence(conn, &collection, Some(&"foo".to_string()), "m123", 1),
BlockGroup::sequence(conn, &collection, Some(&"foo".to_string()), "m123"),
"ATCATCGATCGATCGATCGGGAACACACAGAGA"
);
}
Expand Down
Loading

0 comments on commit 229ebd4

Please sign in to comment.