ginkgobioworks · Chris7 · Nov 5, 2024 · Oct 30, 2024 · Oct 30, 2024 · Oct 30, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -26,6 +26,7 @@ serde_json = "1.0.128"
 sha2 = "0.10.8"
 tempfile = "3.12.0"
 tempdir = "0.3.7"
+interavl = "0.2.0"
 
 [dev-dependencies]
 cargo-llvm-cov = "0.6.13"
diff --git a/docs/figures/iterative_changes_f1.png b/docs/figures/iterative_changes_f1.png
diff --git a/docs/figures/iterative_changes_f2.png b/docs/figures/iterative_changes_f2.png
diff --git a/docs/iterative_changes.md b/docs/iterative_changes.md
@@ -0,0 +1,76 @@
+# Iterative Changes
+
+A core capability of Gen is the ability to model iterative engineering of a cell line, i.e. multiple successive rounds
+of engineering. As an example, suppose we insert two landing pads into a genome for general use. Iterative engineering
+allows us to keep working from that cell line with these changes already baked in.
+
+The challenge in modeling iterative engineering is the frame of reference for changes. In the above example, inserting
+a landing pad will change the reference coordinates of the resulting genome. Thus, if we are working with 2 landing pads,
+we may want to be able to address them in either the reference frame of the initial reference genome, or in the reference
+frame of the changed genome. To illustrate this, here is an example of 2 rounds of engineering from a base sequence.
+
+First, we import a sequence file:
+
+```bash
+gen --db simple.db import --fasta simple.fa --name simple_example
+```
+
+This creates a collection, `simple_example`, with the simple fasta file serving as the reference genome. This could just
+as easily be hg38 or any other reference.
+
+Next, we use a vcf file to model changes. We want to delete two regions of the genome:
+
+```text
+##fileformat=VCFv4.1
+##filedate=Tue Sep  4 13:12:57 2018
+##reference=simple.fa
+##contig=<ID=m123,length=34>
+##phasing=none
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	f1
+m123	3	.	CGA	C	1611.92	.		GT	1
+m123	16	.	GAT	G	1611.92	.		GT	1
+```
+
+```bash
+gen --db simple.db update --vcf round1.vcf --name simple_example
+```
+This creates a new sample, `f1`, with the above changes baked into its genome. This example models a haploid organism
+such as E. coli, where the genotype is always homozygous. Graphically, the genome now looks like this:
+
+![F1 Genome](figures/iterative_changes_f1.png)
+
+Next, we want to make more changes -- SNPs, insertions, and deletions -- based on this changed genome.
+
+```vcf
+##fileformat=VCFv4.1
+##filedate=Tue Sep  4 13:12:57 2018
+##reference=simple.fa
+##contig=<ID=m123,length=34>
+##phasing=none
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	f2
+m123	5	.	C	G	1611.92	.		GT	1
+m123	17	.	GG	GAATCAG	1611.92	.		GT	1
+m123	27	.	GA	G	1611.92	.		GT	1
+```
+
+```bash
+gen --db simple.db update --vcf round2.vcf --name simple_example --coordinate-frame f1
+```
+
+This command is very similar to the first update, except that we now specify which reference frame the changes use. Here,
+we specify the reference frame of sample `f1`. This operation creates a new sample, `f2`, whose changes are positioned
+using the coordinates of `f1`. If no coordinate frame is provided, the reference genome's frame of reference is used. The
+resulting genome appears as follows:
+
+![F2 Genome](figures/iterative_changes_f2.png)
+
+## Caveats
+
+When updating a genome based on positions, only non-ambiguous changes are permitted. For example, if the above vcf
+contained a heterozygous insertion, it would create an ambiguity in positions downstream of the insertion. Thus,
+such changes are not permitted. However, this format is well suited to simpler organisms such as E. coli.
+
+For changes where positions are ambiguous, the following approaches may be taken to model changes:
+* links here
diff --git a/fixtures/simple_iterative_engineering_1.vcf b/fixtures/simple_iterative_engineering_1.vcf
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.1
+##filedate=Tue Sep  4 13:12:57 2018
+##reference=simple.fa
+##contig=<ID=m123,length=34>
+##phasing=none
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	f1
+m123	3	.	CGA	C	1611.92	.		GT	1
+m123	16	.	GAT	G	1611.92	.		GT	1
diff --git a/fixtures/simple_iterative_engineering_2.vcf b/fixtures/simple_iterative_engineering_2.vcf
@@ -0,0 +1,10 @@
+##fileformat=VCFv4.1
+##filedate=Tue Sep  4 13:12:57 2018
+##reference=simple.fa
+##contig=<ID=m123,length=34>
+##phasing=none
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	f2
+m123	5	.	C	G	1611.92	.		GT	1
+m123	17	.	GG	GAATCAG	1611.92	.		GT	1
+m123	27	.	GA	G	1611.92	.		GT	1
diff --git a/fixtures/simple_iterative_engineering_3.vcf b/fixtures/simple_iterative_engineering_3.vcf
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.1
+##filedate=Tue Sep  4 13:12:57 2018
+##reference=simple.fa
+##contig=<ID=m123,length=34>
+##phasing=none
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	f3
+m123	4	.	T	G	1611.92	.		GT	1
+m123	15	.	CGGAA	C	1611.92	.		GT	1
diff --git a/fixtures/simple_overlap.vcf b/fixtures/simple_overlap.vcf
@@ -3,62 +3,9 @@
 ##reference=simple.fa
 ##contig=<ID=m123,length=34>
 ##phasing=none
-##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
-##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
-##INFO=<ID=DPB,Number=1,Type=Float,Description="Total read depth per bp at the locus; bases in reads overlapping / bases in haplotype">
-##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
-##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
-##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1]">
-##INFO=<ID=RO,Number=1,Type=Integer,Description="Count of full observations of the reference haplotype.">
-##INFO=<ID=AO,Number=A,Type=Integer,Description="Count of full observations of this alternate haplotype.">
-##INFO=<ID=PRO,Number=1,Type=Float,Description="Reference allele observation count, with partial observations recorded fractionally">
-##INFO=<ID=PAO,Number=A,Type=Float,Description="Alternate allele observations, with partial observations recorded fractionally">
-##INFO=<ID=QR,Number=1,Type=Integer,Description="Reference allele quality sum in phred">
-##INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
-##INFO=<ID=PQR,Number=1,Type=Float,Description="Reference allele quality sum in phred for partial observations">
-##INFO=<ID=PQA,Number=A,Type=Float,Description="Alternate allele quality sum in phred for partial observations">
-##INFO=<ID=SRF,Number=1,Type=Integer,Description="Number of reference observations on the forward strand">
-##INFO=<ID=SRR,Number=1,Type=Integer,Description="Number of reference observations on the reverse strand">
-##INFO=<ID=SAF,Number=A,Type=Integer,Description="Number of alternate observations on the forward strand">
-##INFO=<ID=SAR,Number=A,Type=Integer,Description="Number of alternate observations on the reverse strand">
-##INFO=<ID=SRP,Number=1,Type=Float,Description="Strand balance probability for the reference allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SRF and SRR given E(SRF/SRR) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=SAP,Number=A,Type=Float,Description="Strand balance probability for the alternate allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SAF and SAR given E(SAF/SAR) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
-##INFO=<ID=ABP,Number=A,Type=Float,Description="Allele balance probability at heterozygous sites: Phred-scaled upper-bounds estimate of the probability of observing the deviation between ABR and ABA given E(ABR/ABA) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=RUN,Number=A,Type=Integer,Description="Run length: the number of consecutive repeats of the alternate allele in the reference genome">
-##INFO=<ID=RPP,Number=A,Type=Float,Description="Read Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=RPPR,Number=1,Type=Float,Description="Read Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=RPL,Number=A,Type=Float,Description="Reads Placed Left: number of reads supporting the alternate balanced to the left (5') of the alternate allele">
-##INFO=<ID=RPR,Number=A,Type=Float,Description="Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele">
-##INFO=<ID=EPP,Number=A,Type=Float,Description="End Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=EPPR,Number=1,Type=Float,Description="End Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=DPRA,Number=A,Type=Float,Description="Alternate allele depth ratio.  Ratio between depth in samples with each called alternate allele and those without.">
-##INFO=<ID=ODDS,Number=1,Type=Float,Description="The log odds ratio of the best genotype combination to the second-best.">
-##INFO=<ID=GTI,Number=1,Type=Integer,Description="Number of genotyping iterations required to reach convergence or bailout.">
-##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">
-##INFO=<ID=CIGAR,Number=A,Type=String,Description="The extended CIGAR representation of each alternate allele, with the exception that '=' is replaced by 'M' to ease VCF parsing.  Note that INDEL alleles do not have the first matched base (which is provided by default, per the spec) referred to by the CIGAR.">
-##INFO=<ID=NUMALT,Number=1,Type=Integer,Description="Number of unique non-reference alleles in called genotypes at this position.">
-##INFO=<ID=MEANALT,Number=A,Type=Float,Description="Mean number of unique non-reference allele observations per sample with the corresponding alternate alleles.">
-##INFO=<ID=LEN,Number=A,Type=Integer,Description="allele length">
-##INFO=<ID=MQM,Number=A,Type=Float,Description="Mean mapping quality of observed alternate alleles">
-##INFO=<ID=MQMR,Number=1,Type=Float,Description="Mean mapping quality of observed reference alleles">
-##INFO=<ID=PAIRED,Number=A,Type=Float,Description="Proportion of observed alternate alleles which are supported by properly paired read fragments">
-##INFO=<ID=PAIREDR,Number=1,Type=Float,Description="Proportion of observed reference alleles which are supported by properly paired read fragments">
-##INFO=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum depth in gVCF output block.">
-##INFO=<ID=END,Number=1,Type=Integer,Description="Last position (inclusive) in gVCF output record.">
-##INFO=<ID=FUZZY,Number=1,Type=Integer,Description="repeat size at the breakpoint, only for INS and DEL">
 ##filter="QUAL > 1"
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
-##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy">
-##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
-##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Number of observation for each allele">
-##FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count">
-##FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of the reference observations">
-##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observation count">
-##FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of the alternate observations">
-##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum depth in gVCF output block.">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	unknown	G1	foo
-m123	3	.	CGA	CA	1611.92	.	AB=0;ABP=0;AC=2;AF=1;AN=2;AO=53;CIGAR=1M1D1M;DP=56;DPB=38;DPRA=0;EPP=3.37904;EPPR=0;GTI=0;LEN=1;MEANALT=3;MQM=60;MQMR=0;NS=1;NUMALT=1;ODDS=76.9802;PAIRED=1;PAIREDR=0;PAO=0;PQA=0;PQR=0;PRO=0;QA=1837;QR=0;RO=0;RPL=18;RPP=14.851;RPPR=0;RPR=35;RUN=1;SAF=27;SAP=3.05127;SAR=26;SRF=0;SRP=0;SRR=0;TYPE=del	GT:DP:AD:RO:QR:AO:QA:GL	1/1:56:0,53:0:0:53:1837:-165.301,-15.9546,0	.	1/1:56:0,53:0:0:53:1837:-165.301,-15.9546,0
-m123	10	.	TC	*,TAGA	2216.6	.	AB=0;ABP=0;AC=2;AF=1;AN=2;AO=74;CIGAR=1M2I1X;DP=76;DPB=154.5;DPRA=0;EPP=4.06669;EPPR=0;GTI=0;LEN=3;MEANALT=3;MQM=60;MQMR=0;NS=1;NUMALT=1;ODDS=113.024;PAIRED=0.986486;PAIREDR=0;PAO=2.5;PQA=92.5;PQR=92.5;PRO=2.5;QA=2502;QR=0;RO=0;RPL=31;RPP=7.23587;RPPR=0;RPR=43;RUN=1;SAF=37;SAP=3.0103;SAR=37;SRF=0;SRP=0;SRR=0;TYPE=complex	GT:DP:AD:RO:QR:AO:QA:GL	1/1:76:0,74:0:0:74:2502:-225.285,-25.2865,0	.	0/0:56:0,53:0:0:53:1837:-165.301,-15.9546,0
+m123	3	.	CGA	CA	1611.92	.	.	GT	1/1	.	1/1
+m123	10	.	TC	*,TAGA	2216.6	.	.	GT	1/1	.	0/0
diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs
@@ -61,7 +61,9 @@ pub fn export_gfa(
     let boundary_edges = Edge::boundary_edges_from_sequences(&blocks);
     edges.extend(boundary_edges.clone());
 
-    let (graph, edges_by_node_pair) = Edge::build_graph(&edges, &blocks);
+    let (mut graph, edges_by_node_pair) = Edge::build_graph(&edges, &blocks);
+
+    BlockGroup::prune_graph(&mut graph);
 
     let file = File::create(filename).unwrap();
     let mut writer = BufWriter::new(file);
@@ -342,7 +344,7 @@ mod tests {
             &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id],
         );
 
-        let all_sequences = BlockGroup::get_all_sequences(&conn, block_group.id);
+        let all_sequences = BlockGroup::get_all_sequences(&conn, block_group.id, false);
 
         let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
         let mut gfa_path = PathBuf::from(temp_dir.path());
@@ -355,7 +357,7 @@ mod tests {
         let block_group2 = Collection::get_block_groups(&conn, "test collection 2")
             .pop()
             .unwrap();
-        let all_sequences2 = BlockGroup::get_all_sequences(&conn, block_group2.id);
+        let all_sequences2 = BlockGroup::get_all_sequences(&conn, block_group2.id, false);
 
         assert_eq!(all_sequences, all_sequences2);
 
@@ -374,7 +376,7 @@ mod tests {
         import_gfa(&gfa_path, &collection_name, conn);
 
         let block_group_id = BlockGroup::get_id(conn, &collection_name, None, "");
-        let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id);
+        let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id, false);
 
         let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
         let mut gfa_path = PathBuf::from(temp_dir.path());
@@ -386,7 +388,7 @@ mod tests {
         let block_group2 = Collection::get_block_groups(conn, "test collection 2")
             .pop()
             .unwrap();
-        let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id);
+        let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id, false);
 
         assert_eq!(all_sequences, all_sequences2);
     }
@@ -401,7 +403,7 @@ mod tests {
         import_gfa(&gfa_path, &collection_name, conn);
 
         let block_group_id = BlockGroup::get_id(conn, &collection_name, None, "");
-        let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id);
+        let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id, false);
 
         let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
         let mut gfa_path = PathBuf::from(temp_dir.path());
@@ -413,7 +415,7 @@ mod tests {
         let block_group2 = Collection::get_block_groups(conn, "anderson promoters 2")
             .pop()
             .unwrap();
-        let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id);
+        let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id, false);
 
         assert_eq!(all_sequences, all_sequences2);
     }
@@ -428,7 +430,7 @@ mod tests {
         import_gfa(&gfa_path, &collection_name, conn);
 
         let block_group_id = BlockGroup::get_id(conn, &collection_name, None, "");
-        let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id);
+        let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id, false);
 
         let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
         let mut gfa_path = PathBuf::from(temp_dir.path());
@@ -440,7 +442,7 @@ mod tests {
         let block_group2 = Collection::get_block_groups(conn, "test collection 2")
             .pop()
             .unwrap();
-        let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id);
+        let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id, false);
 
         assert_eq!(all_sequences, all_sequences2);
     }

diff --git a/src/graph.rs b/src/graph.rs
@@ -1,8 +1,10 @@
+use std::collections::{HashSet, VecDeque};
+use std::fmt::Debug;
 use std::hash::Hash;
 use std::iter::from_fn;
 
 use crate::models::strand::Strand;
-use petgraph::visit::{IntoNeighborsDirected, NodeCount};
+use petgraph::visit::{GraphRef, IntoNeighbors, IntoNeighborsDirected, NodeCount};
 use petgraph::Direction;
 
 #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)]
@@ -65,6 +67,28 @@ where
     })
 }
 
+/// Returns every node that can be reached from at least one of the starting `nodes`.
+///
+/// The graph is explored breadth-first by repeatedly following `graph.neighbors(..)`.
+/// Each starting node is itself part of the result, even when it has no neighbors, so an
+/// empty `nodes` slice is the only input that yields an empty set.
+pub fn all_reachable_nodes<G>(graph: G, nodes: &[G::NodeId]) -> HashSet<G::NodeId>
+where
+    G: GraphRef + IntoNeighbors,
+    G::NodeId: Eq + Hash + Debug,
+{
+    let mut visited = HashSet::new();
+    let mut frontier = VecDeque::new();
+    for &start in nodes {
+        visited.insert(start);
+        frontier.push_back(start);
+        // The frontier is fully drained before the next starting node is queued.
+        while let Some(current) = frontier.pop_front() {
+            for neighbor in graph.neighbors(current) {
+                // `insert` returns `true` only the first time a node is seen, so a neighbor
+                // reached along several paths is expanded just once.
+                if visited.insert(neighbor) {
+                    frontier.push_back(neighbor);
+                }
+            }
+        }
+    }
+    visited
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -227,4 +251,48 @@ mod tests {
             ])
         );
     }
+
+    #[test]
+    fn test_finds_all_reachable_nodes() {
+        // Two chains that merge at node 3:
+        //
+        //   1 -> 2 -> 3 -> 4 -> 5
+        //           /
+        //   6 -> 7
+        //
+        let mut graph: DiGraphMap<i64, ()> = DiGraphMap::new();
+        for node in 1..=7 {
+            graph.add_node(node);
+        }
+        for &(source, target) in &[(1, 2), (2, 3), (3, 4), (4, 5), (6, 7), (7, 3)] {
+            graph.add_edge(source, target, ());
+        }
+
+        let reachable_from = |starts: &[i64]| all_reachable_nodes(&graph, starts);
+        let expected = |members: &[i64]| members.iter().copied().collect::<HashSet<i64>>();
+
+        // A single start follows the main chain to its end.
+        assert_eq!(reachable_from(&[1]), expected(&[1, 2, 3, 4, 5]));
+        // Starting from both chain heads covers the whole graph.
+        assert_eq!(reachable_from(&[1, 6]), expected(&[1, 2, 3, 4, 5, 6, 7]));
+        // Nodes upstream of the start are never included.
+        assert_eq!(reachable_from(&[3]), expected(&[3, 4, 5]));
+        // A node without outgoing edges only reaches itself.
+        assert_eq!(reachable_from(&[5]), expected(&[5]));
+    }
 }