Skip to content

Commit

Permalink
Merge pull request #13 from ginkgobioworks/changelog-experiment
Browse files Browse the repository at this point in the history
Changelog PoC
  • Loading branch information
Chris7 authored Aug 6, 2024
2 parents 0147a1c + 697a726 commit d217e9e
Show file tree
Hide file tree
Showing 7 changed files with 666 additions and 110 deletions.
54 changes: 54 additions & 0 deletions fixtures/general.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
##fileformat=VCFv4.1
##filedate=Tue Sep 4 13:12:57 2018
##reference=simple.fa
##contig=<ID=m123,length=34>
##phasing=none
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
##INFO=<ID=DPB,Number=1,Type=Float,Description="Total read depth per bp at the locus; bases in reads overlapping / bases in haplotype">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1]">
##INFO=<ID=RO,Number=1,Type=Integer,Description="Count of full observations of the reference haplotype.">
##INFO=<ID=AO,Number=A,Type=Integer,Description="Count of full observations of this alternate haplotype.">
##INFO=<ID=PRO,Number=1,Type=Float,Description="Reference allele observation count, with partial observations recorded fractionally">
##INFO=<ID=PAO,Number=A,Type=Float,Description="Alternate allele observations, with partial observations recorded fractionally">
##INFO=<ID=QR,Number=1,Type=Integer,Description="Reference allele quality sum in phred">
##INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
##INFO=<ID=PQR,Number=1,Type=Float,Description="Reference allele quality sum in phred for partial observations">
##INFO=<ID=PQA,Number=A,Type=Float,Description="Alternate allele quality sum in phred for partial observations">
##INFO=<ID=SRF,Number=1,Type=Integer,Description="Number of reference observations on the forward strand">
##INFO=<ID=SRR,Number=1,Type=Integer,Description="Number of reference observations on the reverse strand">
##INFO=<ID=SAF,Number=A,Type=Integer,Description="Number of alternate observations on the forward strand">
##INFO=<ID=SAR,Number=A,Type=Integer,Description="Number of alternate observations on the reverse strand">
##INFO=<ID=SRP,Number=1,Type=Float,Description="Strand balance probability for the reference allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SRF and SRR given E(SRF/SRR) ~ 0.5, derived using Hoeffding's inequality">
##INFO=<ID=SAP,Number=A,Type=Float,Description="Strand balance probability for the alternate allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SAF and SAR given E(SAF/SAR) ~ 0.5, derived using Hoeffding's inequality">
##INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
##INFO=<ID=ABP,Number=A,Type=Float,Description="Allele balance probability at heterozygous sites: Phred-scaled upper-bounds estimate of the probability of observing the deviation between ABR and ABA given E(ABR/ABA) ~ 0.5, derived using Hoeffding's inequality">
##INFO=<ID=RUN,Number=A,Type=Integer,Description="Run length: the number of consecutive repeats of the alternate allele in the reference genome">
##INFO=<ID=RPP,Number=A,Type=Float,Description="Read Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
##INFO=<ID=RPPR,Number=1,Type=Float,Description="Read Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
##INFO=<ID=RPL,Number=A,Type=Float,Description="Reads Placed Left: number of reads supporting the alternate balanced to the left (5') of the alternate allele">
##INFO=<ID=RPR,Number=A,Type=Float,Description="Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele">
##INFO=<ID=EPP,Number=A,Type=Float,Description="End Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
##INFO=<ID=EPPR,Number=1,Type=Float,Description="End Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
##INFO=<ID=DPRA,Number=A,Type=Float,Description="Alternate allele depth ratio. Ratio between depth in samples with each called alternate allele and those without.">
##INFO=<ID=ODDS,Number=1,Type=Float,Description="The log odds ratio of the best genotype combination to the second-best.">
##INFO=<ID=GTI,Number=1,Type=Integer,Description="Number of genotyping iterations required to reach convergence or bailout.">
##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">
##INFO=<ID=CIGAR,Number=A,Type=String,Description="The extended CIGAR representation of each alternate allele, with the exception that '=' is replaced by 'M' to ease VCF parsing. Note that INDEL alleles do not have the first matched base (which is provided by default, per the spec) referred to by the CIGAR.">
##INFO=<ID=NUMALT,Number=1,Type=Integer,Description="Number of unique non-reference alleles in called genotypes at this position.">
##INFO=<ID=MEANALT,Number=A,Type=Float,Description="Mean number of unique non-reference allele observations per sample with the corresponding alternate alleles.">
##INFO=<ID=LEN,Number=A,Type=Integer,Description="allele length">
##INFO=<ID=MQM,Number=A,Type=Float,Description="Mean mapping quality of observed alternate alleles">
##INFO=<ID=MQMR,Number=1,Type=Float,Description="Mean mapping quality of observed reference alleles">
##INFO=<ID=PAIRED,Number=A,Type=Float,Description="Proportion of observed alternate alleles which are supported by properly paired read fragments">
##INFO=<ID=PAIREDR,Number=1,Type=Float,Description="Proportion of observed reference alleles which are supported by properly paired read fragments">
##INFO=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum depth in gVCF output block.">
##INFO=<ID=END,Number=1,Type=Integer,Description="Last position (inclusive) in gVCF output record.">
##INFO=<ID=FUZZY,Number=1,Type=Integer,Description="repeat size at the breakpoint, only for INS and DEL">
##filter="QUAL > 1"
##FILTER=<ID=PASS,Description="All filters passed">
#CHROM POS ID REF ALT QUAL FILTER INFO
m123 3 . CGA CA 1611.92 . AB=0;ABP=0;AC=2;AF=1;AN=2;AO=53;CIGAR=1M1D1M;DP=56;DPB=38;DPRA=0;EPP=3.37904;EPPR=0;GTI=0;LEN=1;MEANALT=3;MQM=60;MQMR=0;NS=1;NUMALT=1;ODDS=76.9802;PAIRED=1;PAIREDR=0;PAO=0;PQA=0;PQR=0;PRO=0;QA=1837;QR=0;RO=0;RPL=18;RPP=14.851;RPPR=0;RPR=35;RUN=1;SAF=27;SAP=3.05127;SAR=26;SRF=0;SRP=0;SRR=0;TYPE=del
m123 10 . TC TAGA 2216.6 . AB=0;ABP=0;AC=2;AF=1;AN=2;AO=74;CIGAR=1M2I1X;DP=76;DPB=154.5;DPRA=0;EPP=4.06669;EPPR=0;GTI=0;LEN=3;MEANALT=3;MQM=60;MQMR=0;NS=1;NUMALT=1;ODDS=113.024;PAIRED=0.986486;PAIREDR=0;PAO=2.5;PQA=92.5;PQR=92.5;PRO=2.5;QA=2502;QR=0;RO=0;RPL=31;RPP=7.23587;RPPR=0;RPR=43;RUN=1;SAF=37;SAP=3.0103;SAR=37;SRF=0;SRP=0;SRR=0;TYPE=complex
16 changes: 15 additions & 1 deletion migrations/01-initial/up.sql
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,18 @@ CREATE TABLE path_blocks (
FOREIGN KEY(target_block_id) REFERENCES block(id),
FOREIGN KEY(path_id) REFERENCES path(id)
);
CREATE UNIQUE INDEX path_blocks_uidx ON path_blocks(path_id, source_block_id, target_block_id);
CREATE UNIQUE INDEX path_blocks_uidx ON path_blocks(path_id, source_block_id, target_block_id);

-- change_log: rows are keyed by `hash` and link a span on a path
-- (path_id, path_start, path_end) to a span of a sequence
-- (sequence_hash, sequence_start, sequence_end, sequence_strand).
-- path_id must exist in path(id); sequence_hash must exist in sequence(hash).
-- NOTE(review): how `hash` is computed and the coordinate conventions
-- (0- vs 1-based, inclusive vs exclusive ends) are not visible in this
-- migration -- confirm against the code that inserts rows.
CREATE TABLE change_log (
hash TEXT PRIMARY KEY NOT NULL,
path_id INTEGER NOT NULL,
path_start INTEGER NOT NULL,
path_end INTEGER NOT NULL,
sequence_hash TEXT NOT NULL,
sequence_start INTEGER NOT NULL,
sequence_end INTEGER NOT NULL,
sequence_strand TEXT NOT NULL,
FOREIGN KEY(path_id) REFERENCES path(id),
FOREIGN KEY(sequence_hash) REFERENCES sequence(hash)
);
-- NOTE(review): `hash` is already the PRIMARY KEY, which SQLite enforces with
-- an implicit unique index, so this explicit unique index on the same column
-- looks redundant -- confirm nothing drops it by name before removing it.
CREATE UNIQUE INDEX change_log_uidx ON change_log(hash);
104 changes: 104 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub mod migrations;
pub mod models;

use crate::migrations::run_migrations;
use noodles::vcf::variant::record::samples::series::value::genotype::Phasing;
use rusqlite::Connection;
use sha2::{Digest, Sha256};

Expand All @@ -30,6 +31,51 @@ pub fn calculate_hash(t: &str) -> String {
format!("{:x}", result)
}

/// A single allele call taken from a genotype string, as produced by
/// [`parse_genotype`].
#[derive(Debug, PartialEq)]
pub struct Genotype {
    /// Allele number exactly as written in the genotype string.
    pub allele: i32,
    /// Phasing that [`parse_genotype`] assigned to this allele.
    pub phasing: Phasing,
}

/// Parses a genotype string such as `0|1`, `0/1|2` or `2|.|2` into one entry
/// per allele.
///
/// The string is split on `|` and `/`. Every allele is tagged with the phasing
/// of the separator that precedes it (`|` is phased, `/` is unphased). The
/// first allele has no preceding separator: it is unphased when the string
/// contains a `/` anywhere, and phased otherwise. An allele written as `.`
/// yields `None`.
///
/// # Panics
///
/// Panics when an allele is neither `.` nor parseable as an `i32`.
pub fn parse_genotype(gt: &str) -> Vec<Option<Genotype>> {
    let mut genotypes = vec![];
    // Phasing that will be attached to the next allele pushed; this initial
    // value is the one used for the very first allele.
    let mut phase = if gt.contains('/') {
        Phasing::Unphased
    } else {
        Phasing::Phased
    };
    for entry in gt.split_inclusive(['|', '/']) {
        // Each piece keeps its trailing separator (if any). That separator
        // describes the phase of the NEXT allele, not of this one.
        let (allele, next_phase) = if let Some(allele) = entry.strip_suffix('|') {
            (allele, Phasing::Phased)
        } else if let Some(allele) = entry.strip_suffix('/') {
            (allele, Phasing::Unphased)
        } else {
            (entry, Phasing::Unphased)
        };
        if allele == "." {
            genotypes.push(None);
        } else {
            let number = allele.parse::<i32>().unwrap_or_else(|err| {
                panic!(
                    "invalid allele {:?} in genotype {:?}: {}",
                    allele, gt, err
                )
            });
            genotypes.push(Some(Genotype {
                allele: number,
                phasing: phase,
            }));
        }
        // we're always 1 behind on phase, e.g. 0|1, the | is the phase of the next allele
        phase = next_phase;
    }
    genotypes
}

/// Compares the span `a..b` against the span `x..y`.
///
/// Returns `(contains_start, contains_end, overlap)` where:
/// - `contains_start` is `a <= x && x < b`,
/// - `contains_end` is `a < y && y < b` (so `y == b` is reported as `false`),
/// - `overlap` is `a < y && x < b`.
pub fn get_overlap(a: i32, b: i32, x: i32, y: i32) -> (bool, bool, bool) {
    let starts_inside = (a..b).contains(&x);
    let ends_inside = y > a && y < b;
    let intersects = y > a && b > x;
    (starts_inside, ends_inside, intersects)
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -62,4 +108,62 @@ mod tests {
.unwrap();
assert_eq!(sequence_count, 0);
}

#[test]
fn parses_genotype() {
    // Asserts that the entry at `idx` is present and carries the given allele
    // number and phasing.
    fn expect_call(parsed: &[Option<Genotype>], idx: usize, allele: i32, phasing: Phasing) {
        let call = parsed[idx].as_ref().unwrap();
        assert_eq!(call.allele, allele);
        assert_eq!(call.phasing, phasing);
    }

    // A lone allele with no separator is treated as phased.
    let parsed = parse_genotype("1");
    expect_call(&parsed, 0, 1, Phasing::Phased);

    // Fully phased pair.
    let parsed = parse_genotype("0|1");
    expect_call(&parsed, 0, 0, Phasing::Phased);
    expect_call(&parsed, 1, 1, Phasing::Phased);

    // Fully unphased pair.
    let parsed = parse_genotype("0/1");
    expect_call(&parsed, 0, 0, Phasing::Unphased);
    expect_call(&parsed, 1, 1, Phasing::Unphased);

    // Mixed separators: each allele takes the phase of the separator before it.
    let parsed = parse_genotype("0/1|2");
    expect_call(&parsed, 0, 0, Phasing::Unphased);
    expect_call(&parsed, 1, 1, Phasing::Unphased);
    expect_call(&parsed, 2, 2, Phasing::Phased);

    // Three phased alleles.
    let parsed = parse_genotype("2|1|2");
    expect_call(&parsed, 0, 2, Phasing::Phased);
    expect_call(&parsed, 1, 1, Phasing::Phased);
    expect_call(&parsed, 2, 2, Phasing::Phased);

    // A missing allele (`.`) in the middle is reported as `None`.
    let parsed = parse_genotype("2|.|2");
    expect_call(&parsed, 0, 2, Phasing::Phased);
    expect_call(&parsed, 2, 2, Phasing::Phased);
    assert!(parsed[1].is_none());
}

#[test]
fn test_overlaps() {
    // Each case pairs the `(a, b, x, y)` arguments with the expected
    // `(contains_start, contains_end, overlap)` result.
    let cases = [
        ((0, 10, 10, 10), (false, false, false)),
        ((10, 20, 10, 20), (true, false, true)),
        ((10, 20, 5, 15), (false, true, true)),
        ((10, 20, 0, 10), (false, false, false)),
    ];
    for &((a, b, x, y), expected) in cases.iter() {
        assert_eq!(get_overlap(a, b, x, y), expected);
    }
}
}
Loading

0 comments on commit d217e9e

Please sign in to comment.