From d7aa187ef0da901ec185c2c4b91c19e48043cbff Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 23 Aug 2023 11:40:42 -0700 Subject: [PATCH 01/23] added a prelim test --- vdj_ann/src/transcript.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 2fa9f708..ac65df01 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -343,3 +343,31 @@ pub fn junction_supp_core( idi = idj; } } + + +#[cfg(test)] +mod tests { + use super::*; + use crate::refx; + + #[test] + fn test_is_valid() { + use refx::RefData; + use debruijn::dna_string::DnaString; + + let refdata = RefData::from_fasta(&String::from( + "test/inputs/test_no_internal_soft_clipping_ref.fa", + )); + + let b = DnaString::from_dna_string("AAACGT"); + let ann = [(1,2,0,4,5)]; + let mut log: Vec = vec![]; + + let aaa = is_valid(&b, &refdata, &ann, false, &mut log, None); + + assert!(!aaa); + + + + } +} From ca66019fe1d0ea2d29028cf8f5db712d06442ec5 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 23 Aug 2023 14:00:57 -0700 Subject: [PATCH 02/23] added telemetry for unproductive reasoning --- vdj_ann/src/annotate.rs | 2 +- vdj_ann/src/transcript.rs | 36 ++++++++++++++++++++++++++---------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs index dee29ef6..1fd22353 100644 --- a/vdj_ann/src/annotate.rs +++ b/vdj_ann/src/annotate.rs @@ -3179,7 +3179,7 @@ impl ContigAnnotation { let mut ann = Vec::<(i32, i32, i32, i32, i32)>::new(); annotate_seq(b, refdata, &mut ann, true, false, true); let mut log = Vec::::new(); - let productive = is_valid(b, refdata, &ann, false, &mut log, is_gd); + let productive = is_valid(b, refdata, &ann, false, &mut log, is_gd).0; ContigAnnotation::from_annotate_seq( b, q, diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index ac65df01..263996db 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -16,6 +16,13 @@ use vector_utils::{lower_bound1_3, unique_sort}; // TEST FOR VALID VDJ SEQUENCE // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ +#[derive(Debug)] +pub enum UnproductiveContigCause{ + NoCdr3, + Misordered, + NotFull, +} + pub fn is_valid( b: &DnaString, refdata: &RefData, @@ -23,11 +30,12 @@ pub fn is_valid( logme: bool, log: &mut Vec, is_gd: Option, -) -> bool { +) -> (bool, Vec){ // Unwrap gamma/delta mode flag let gd_mode = is_gd.unwrap_or(false); let refs = &refdata.refs; let rheaders = &refdata.rheaders; + let mut ret_vec = Vec::new(); for pass in 0..2 { let mut m = "A"; if pass == 1 { @@ -139,7 +147,7 @@ pub fn is_valid( if logme { fwriteln!(log, "did not find CDR3"); } - return false; + return (false, vec![UnproductiveContigCause::NoCdr3]); } let mut too_large = false; const MIN_DELTA: i32 = -25; @@ -179,15 +187,17 @@ pub fn is_valid( } if misordered && logme { fwriteln!(log, "misordered"); + ret_vec.push(UnproductiveContigCause::Misordered); } if !full && logme { fwriteln!(log, "not full"); + ret_vec.push(UnproductiveContigCause::NotFull); } if full && !too_large && !misordered { - return true; + return (true, vec![]); } } - false + (false, ret_vec) } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ @@ -354,18 +364,24 @@ mod tests { fn test_is_valid() { use refx::RefData; use debruijn::dna_string::DnaString; - + // TODO use a smaller ref checked in to the repo let refdata = RefData::from_fasta(&String::from( - "test/inputs/test_no_internal_soft_clipping_ref.fa", + "/mnt/opt/refdata_cellranger/vdj/vdj_GRCm38_alts_ensembl-7.1.0/fasta/regions.fa", )); - let b = DnaString::from_dna_string("AAACGT"); - let ann = [(1,2,0,4,5)]; let mut log: Vec = vec![]; - let aaa = is_valid(&b, &refdata, &ann, false, &mut log, None); + // // NoCdr3 + // let b = DnaString::from_dna_string("ACATCTCTCTCATTAGAGGTTGATCTTTGAGGAAAACAGGGTGTTGCCTAAAGGATGAAAGTGTTGAGTCTGTTGTACCTGTTGACAGCCATTCCTGGTATCCTGTCTGATGTACAGCTTCAGGAGTCAGGACCTGGCCTCGTGAAACCTTCTCAGTCTCTGTCTCTCACCTGCTCTGTCACTGGCTACTCCATCACCAGTGGTTATTACTGGAACTGGATCCGGCAGTTTCCAGGAAACAAACTGGAATGGATGGGCTACATAAGCTACGACGGTAGCAATAACTACAACCCATCTCTCAAAAATCGAATCTCCATCACTCGTGACACATCTAAGAACCAGTTTTTCCTGAAGTTGAATTCTGTGACTACTGAGGACACAGCTACATATTACTGTGCAAGATCTACTATGATTACGACGGGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAG"); + // let ann = [(54, 148, 139, 0, 16), (205, 246, 139, 148, 58), (418, 48, 29, 0, 2)]; + + let b = DnaString::from_dna_string("GACACATCTCTCTCATTAGAGGTTGATCTTTGAGGAAAACAGGGTGTTGCCTAAAGGATGAAAGTGTTGAGTCTGTTGTACCTGTTGACAGCCATTCCTGGTATCCTGTCTGATGTACAGCTTCAGGAGTCAGGACCTGGCCTCGTGAAACCTTCTCAGTCTCTGTCTCTCACCTGCTCTGTCACTGGCTACTCCATCACCAGTGGTTATTACTGGAACTGGATCCGGCAGTTTCCAGGAAACAAACTGGAATGGATGGGCTACATAAGCTACGACGGTAGCAATAACTACAACCCATCTCTCAAAAATCGAATCTCCATCACTCGTGACACATCTAAGAACCAGTTTTTCCTGAAGTTGAATTCTGTGACTACTGAGGACACAGCTACATATTACTGTGCAAGATCTACTATGATTACGACGGGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAGAGAGTCAGTCCTTCCCAAATGTCTTCCCCCTCGTCTCCTGCGAGAGCCCCCTGTCTGATAAGAATCTGGTGGCCATGGGCTGCCTGGCCCGGGACTTCCTGCCCAGCACCATTTCCTTCACCTGGAACTACCAGAACAACACTGAAGTCATCCAGGGTATCAGAACCTTCCCAACACTGAGGACAGGGGGCAAGTACCTAGCCACCTCGCA"); + let ann = [(57, 148, 139, 0, 16), (208, 246, 139, 148, 58), (421, 48, 29, 0, 2), (469, 211, 31, 0, 0)]; + - assert!(!aaa); + let aaa = is_valid(&b, &refdata, &ann, false, &mut log, None); + println!("{:?}", aaa); + assert!(1 == 2); From 86c8f20cf124ad5269a1d3796ce11b578ff0598e Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 23 Aug 2023 15:12:08 -0700 Subject: [PATCH 03/23] added tooLarge --- vdj_ann/src/transcript.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 263996db..8ed2ce1b 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -21,6 +21,7 @@ pub enum UnproductiveContigCause{ NoCdr3, Misordered, NotFull, + TooLarge, } pub fn is_valid( @@ -189,6 +190,10 @@ pub fn is_valid( fwriteln!(log, "misordered"); ret_vec.push(UnproductiveContigCause::Misordered); } + if too_large && logme { + fwriteln!(log, "too large"); + ret_vec.push(UnproductiveContigCause::TooLarge); + } if !full && logme { fwriteln!(log, "not full"); ret_vec.push(UnproductiveContigCause::NotFull); From b5b1c0126cba5ccecfabca61920bc02d04137ea0 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 23 Aug 2023 15:22:46 -0700 Subject: [PATCH 04/23] added prints --- vdj_ann/src/transcript.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 8ed2ce1b..0074fcd6 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -189,14 +189,17 @@ pub fn is_valid( if misordered && logme { fwriteln!(log, "misordered"); ret_vec.push(UnproductiveContigCause::Misordered); + println!(">> Misordered"); } if too_large && logme { fwriteln!(log, "too large"); ret_vec.push(UnproductiveContigCause::TooLarge); + println!(">> TooLarge"); } if !full && logme { fwriteln!(log, "not full"); ret_vec.push(UnproductiveContigCause::NotFull); + println!(">> NotFull"); } if full && !too_large && !misordered { return (true, vec![]); From cadb7263223d4a3d5bb5ed6e0c691d062ddf56d1 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 23 Aug 2023 15:34:07 -0700 Subject: [PATCH 05/23] updated telem --- vdj_ann/src/transcript.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 0074fcd6..807581cf 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -190,16 +190,19 @@ pub fn is_valid( fwriteln!(log, "misordered"); ret_vec.push(UnproductiveContigCause::Misordered); println!(">> Misordered"); + (false, vec![UnproductiveContigCause::Misordered]) } if too_large && logme { fwriteln!(log, "too large"); ret_vec.push(UnproductiveContigCause::TooLarge); println!(">> TooLarge"); + (false, vec![UnproductiveContigCause::TooLarge]) } if !full && logme { fwriteln!(log, "not full"); ret_vec.push(UnproductiveContigCause::NotFull); println!(">> NotFull"); + (false, vec![UnproductiveContigCause::NotFull]) } if full && !too_large && !misordered { return (true, vec![]); From 6ab352a8164222a3a0ab1a0661c8905885902d14 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 23 Aug 2023 15:35:15 -0700 Subject: [PATCH 06/23] updated telem --- vdj_ann/src/transcript.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 807581cf..e260be18 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -190,19 +190,19 @@ pub fn is_valid( fwriteln!(log, "misordered"); ret_vec.push(UnproductiveContigCause::Misordered); println!(">> Misordered"); - (false, vec![UnproductiveContigCause::Misordered]) + return (false, vec![UnproductiveContigCause::Misordered]); } if too_large && logme { fwriteln!(log, "too large"); ret_vec.push(UnproductiveContigCause::TooLarge); println!(">> TooLarge"); - (false, vec![UnproductiveContigCause::TooLarge]) + return (false, vec![UnproductiveContigCause::TooLarge]); } if !full && logme { fwriteln!(log, "not full"); ret_vec.push(UnproductiveContigCause::NotFull); println!(">> NotFull"); - (false, vec![UnproductiveContigCause::NotFull]) + return (false, vec![UnproductiveContigCause::NotFull]); } if full && !too_large && !misordered { return (true, vec![]); From ceb0c14180acf730e9d09fc4c2194cf79d690b9e Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 23 Aug 2023 15:57:43 -0700 Subject: [PATCH 07/23] update --- vdj_ann/src/transcript.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index e260be18..8e95b6c1 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -22,6 +22,7 @@ pub enum UnproductiveContigCause{ Misordered, NotFull, TooLarge, + ANOTHER, } pub fn is_valid( @@ -208,7 +209,7 @@ pub fn is_valid( return (true, vec![]); } } - (false, ret_vec) + (false, vec![UnproductiveContigCause::ANOTHER]) } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ From 42d6f935de5350eebbf8426f8a564b2f259eaa66 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 23 Aug 2023 16:06:41 -0700 Subject: [PATCH 08/23] update --- vdj_ann/src/transcript.rs | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 8e95b6c1..86242e6e 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -187,29 +187,35 @@ pub fn is_valid( } } } - if misordered && logme { - fwriteln!(log, "misordered"); + if misordered { + if logme { + fwriteln!(log, "misordered"); + } ret_vec.push(UnproductiveContigCause::Misordered); - println!(">> Misordered"); - return (false, vec![UnproductiveContigCause::Misordered]); + // println!(">> Misordered"); + // return (false, vec![UnproductiveContigCause::Misordered]); } - if too_large && logme { - fwriteln!(log, "too large"); + if too_large { + if logme { + fwriteln!(log, "too large"); + } ret_vec.push(UnproductiveContigCause::TooLarge); - println!(">> TooLarge"); - return (false, vec![UnproductiveContigCause::TooLarge]); + // println!(">> TooLarge"); + // return (false, vec![UnproductiveContigCause::TooLarge]); } - if !full && logme { - fwriteln!(log, "not full"); + if !full { + if logme { + fwriteln!(log, "not full"); + } ret_vec.push(UnproductiveContigCause::NotFull); - println!(">> NotFull"); - return (false, vec![UnproductiveContigCause::NotFull]); + // println!(">> NotFull"); + // return (false, vec![UnproductiveContigCause::NotFull]); } if full && !too_large && !misordered { return (true, vec![]); } } - (false, vec![UnproductiveContigCause::ANOTHER]) + (false, ret_vec) } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ From 780cb67351f7f2be39decc9306b504fb46ca7f5b Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 23 Aug 2023 16:37:23 -0700 Subject: [PATCH 09/23] added all failing tests --- vdj_ann/src/transcript.rs | 79 ++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 86242e6e..88205082 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -5,24 +5,26 @@ use crate::annotate::get_cdr3_using_ann; use crate::refx::RefData; use amino::{have_start, have_stop}; -use debruijn::{dna_string::DnaString, kmer::Kmer20, Mer, Vmer}; +use debruijn::dna_string::DnaString; +use debruijn::kmer::Kmer20; +use debruijn::{Mer, Vmer}; use hyperbase::Hyper; use io_utils::fwriteln; use kmer_lookup::make_kmer_lookup_20_single; -use std::{cmp::max, io::prelude::*}; +use std::cmp::max; +use std::io::prelude::*; use vector_utils::{lower_bound1_3, unique_sort}; // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ // TEST FOR VALID VDJ SEQUENCE // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ -#[derive(Debug)] -pub enum UnproductiveContigCause{ +#[derive(Debug, PartialEq, Ord, PartialOrd, Eq)] +pub enum UnproductiveContigCause { NoCdr3, Misordered, NotFull, TooLarge, - ANOTHER, } pub fn is_valid( @@ -32,7 +34,7 @@ pub fn is_valid( logme: bool, log: &mut Vec, is_gd: Option, -) -> (bool, Vec){ +) -> (bool, Vec) { // Unwrap gamma/delta mode flag let gd_mode = is_gd.unwrap_or(false); let refs = &refdata.refs; @@ -215,6 +217,8 @@ pub fn is_valid( return (true, vec![]); } } + ret_vec.sort_unstable(); + ret_vec.dedup(); (false, ret_vec) } @@ -372,7 +376,6 @@ pub fn junction_supp_core( } } - #[cfg(test)] mod tests { use super::*; @@ -380,8 +383,8 @@ mod tests { #[test] fn test_is_valid() { - use refx::RefData; use debruijn::dna_string::DnaString; + use refx::RefData; // TODO use a smaller ref checked in to the repo let refdata = RefData::from_fasta(&String::from( "/mnt/opt/refdata_cellranger/vdj/vdj_GRCm38_alts_ensembl-7.1.0/fasta/regions.fa", @@ -389,19 +392,59 @@ mod tests { let mut log: Vec = vec![]; - // // NoCdr3 - // let b = DnaString::from_dna_string("ACATCTCTCTCATTAGAGGTTGATCTTTGAGGAAAACAGGGTGTTGCCTAAAGGATGAAAGTGTTGAGTCTGTTGTACCTGTTGACAGCCATTCCTGGTATCCTGTCTGATGTACAGCTTCAGGAGTCAGGACCTGGCCTCGTGAAACCTTCTCAGTCTCTGTCTCTCACCTGCTCTGTCACTGGCTACTCCATCACCAGTGGTTATTACTGGAACTGGATCCGGCAGTTTCCAGGAAACAAACTGGAATGGATGGGCTACATAAGCTACGACGGTAGCAATAACTACAACCCATCTCTCAAAAATCGAATCTCCATCACTCGTGACACATCTAAGAACCAGTTTTTCCTGAAGTTGAATTCTGTGACTACTGAGGACACAGCTACATATTACTGTGCAAGATCTACTATGATTACGACGGGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAG"); - // let ann = [(54, 148, 139, 0, 16), (205, 246, 139, 148, 58), (418, 48, 29, 0, 2)]; - - let b = DnaString::from_dna_string("GACACATCTCTCTCATTAGAGGTTGATCTTTGAGGAAAACAGGGTGTTGCCTAAAGGATGAAAGTGTTGAGTCTGTTGTACCTGTTGACAGCCATTCCTGGTATCCTGTCTGATGTACAGCTTCAGGAGTCAGGACCTGGCCTCGTGAAACCTTCTCAGTCTCTGTCTCTCACCTGCTCTGTCACTGGCTACTCCATCACCAGTGGTTATTACTGGAACTGGATCCGGCAGTTTCCAGGAAACAAACTGGAATGGATGGGCTACATAAGCTACGACGGTAGCAATAACTACAACCCATCTCTCAAAAATCGAATCTCCATCACTCGTGACACATCTAAGAACCAGTTTTTCCTGAAGTTGAATTCTGTGACTACTGAGGACACAGCTACATATTACTGTGCAAGATCTACTATGATTACGACGGGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAGAGAGTCAGTCCTTCCCAAATGTCTTCCCCCTCGTCTCCTGCGAGAGCCCCCTGTCTGATAAGAATCTGGTGGCCATGGGCTGCCTGGCCCGGGACTTCCTGCCCAGCACCATTTCCTTCACCTGGAACTACCAGAACAACACTGAAGTCATCCAGGGTATCAGAACCTTCCCAACACTGAGGACAGGGGGCAAGTACCTAGCCACCTCGCA"); - let ann = [(57, 148, 139, 0, 16), (208, 246, 139, 148, 58), (421, 48, 29, 0, 2), (469, 211, 31, 0, 0)]; - + // NoCdr3 + let b = DnaString::from_dna_string("ACATCTCTCTCATTAGAGGTTGATCTTTGAGGAAAACAGGGTGTTGCCTAAAGGATGAAAGTGTTGAGTCTGTTGTACCTGTTGACAGCCATTCCTGGTATCCTGTCTGATGTACAGCTTCAGGAGTCAGGACCTGGCCTCGTGAAACCTTCTCAGTCTCTGTCTCTCACCTGCTCTGTCACTGGCTACTCCATCACCAGTGGTTATTACTGGAACTGGATCCGGCAGTTTCCAGGAAACAAACTGGAATGGATGGGCTACATAAGCTACGACGGTAGCAATAACTACAACCCATCTCTCAAAAATCGAATCTCCATCACTCGTGACACATCTAAGAACCAGTTTTTCCTGAAGTTGAATTCTGTGACTACTGAGGACACAGCTACATATTACTGTGCAAGATCTACTATGATTACGACGGGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAG"); + let ann = [ + (54, 148, 139, 0, 16), + (205, 246, 139, 148, 58), + (418, 48, 29, 0, 2), + ]; + let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); + assert!(!return_value.0); + assert!(return_value + .1 + .iter() + .all(|item| vec![UnproductiveContigCause::NoCdr3].contains(item))); - let aaa = is_valid(&b, &refdata, &ann, false, &mut log, None); - println!("{:?}", aaa); - assert!(1 == 2); + // NotFull + let b = DnaString::from_dna_string("GAACACATGCCCAATGTCCTCTCCACAGACACTGAACACACTGACTCCAACCATGGGGTGGAGTCTGGATCTTTTTCTTCCTCCTGTCAGGAACTGCAGGTGTCCACTCTGAGGTCCAGCTGCAACAGTCTGGACCTGAGCTGGTGAAGCCTGGGGCTTCAGTGAAGATATCCTGCAAGGCTTCTGGCTACACATTCACTGACTACTACATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTGGACTTGTTAATCCTAACAATGGTGGTACTAGCTACAACCAGAAGTTCAAGGGCAAGGCCACATTGACTGTAGACAAGTCCTCCAGCACAGCCTACATGGAGCTCCGCAGCCTGACATCTGAGGACTCTGCGGTCTATTACTGTGCAAGAAGGGCTAGGGTAACTGGGATGCTATGGACTACTGGGGTCAAGGAACCTCAGTCACCGTCTCCTCAGAGAGTCAGTCCTTCCCAAATGTCTTCCCCCTCGTCTCCTGCGAGAGCCCCCTGTCTGATAAGAATCTGGTGGCCATGGGCTGCCTGGCCCGGGACTTCCTGCCCAGCACCATTTCCTTCACCTGGAACTACCAGAACAACACTGAAGTCATCCAGGGTATCAGAACCTTCCCAACACTGAGGACAGGGGGCAAGTACCTAGCCACCTCGCA"); + let ann = [(64, 340, 46, 11, 11)]; + let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); + assert!(!return_value.0); + assert!(return_value + .1 + .iter() + .all(|item| vec![UnproductiveContigCause::NotFull].contains(item))); + // [NotFull, TooLarge] + let ann = [ + (64, 340, 46, 11, 11), + (416, 54, 30, 0, 4), + (470, 211, 31, 0, 0), + ]; + let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); + assert!(!return_value.0); + assert!(return_value.1.iter().all(|item| vec![ + UnproductiveContigCause::NotFull, + UnproductiveContigCause::TooLarge + ] + .contains(item))); + // [Misordered, NotFull, TooLarge] + let ann = [ + (416, 54, 30, 0, 4), + (64, 340, 46, 11, 11), + (470, 211, 31, 0, 0), + ]; + let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); + assert!(!return_value.0); + assert!(return_value.1.iter().all(|item| vec![ + UnproductiveContigCause::NotFull, + UnproductiveContigCause::TooLarge, + UnproductiveContigCause::Misordered + ] + .contains(item))); + assert!(1 == 2); } } From a6193cc81cfc19b38a4cda6ae9f52f3d1e295331 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 23 Aug 2023 16:39:08 -0700 Subject: [PATCH 10/23] update telem --- vdj_ann/src/transcript.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 88205082..92c0fd40 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -52,11 +52,13 @@ pub fn is_valid( let mut last_jstop: i32 = -1; let mut last_jstop_len: i32 = -1; let mut igh = false; + println!("--> {:?}", b); for j in 0..ann.len() { let l = ann[j].0 as usize; let len = ann[j].1 as usize; let t = ann[j].2 as usize; let p = ann[j].3 as usize; + println!("------> {:?}", rheaders[t]); if rheaders[t].contains("IGH") { igh = true; } From 91cabdd95b6d432dc03d7fc6ed663b22c38d12b7 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Thu, 24 Aug 2023 09:50:31 -0700 Subject: [PATCH 11/23] using a smaller ref for tests --- vdj_ann/src/transcript.rs | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 92c0fd40..48a79d1a 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -52,13 +52,11 @@ pub fn is_valid( let mut last_jstop: i32 = -1; let mut last_jstop_len: i32 = -1; let mut igh = false; - println!("--> {:?}", b); for j in 0..ann.len() { let l = ann[j].0 as usize; let len = ann[j].1 as usize; let t = ann[j].2 as usize; let p = ann[j].3 as usize; - println!("------> {:?}", rheaders[t]); if rheaders[t].contains("IGH") { igh = true; } @@ -196,24 +194,18 @@ pub fn is_valid( fwriteln!(log, "misordered"); } ret_vec.push(UnproductiveContigCause::Misordered); - // println!(">> Misordered"); - // return (false, vec![UnproductiveContigCause::Misordered]); } if too_large { if logme { fwriteln!(log, "too large"); } ret_vec.push(UnproductiveContigCause::TooLarge); - // println!(">> TooLarge"); - // return (false, vec![UnproductiveContigCause::TooLarge]); } if !full { if logme { fwriteln!(log, "not full"); } ret_vec.push(UnproductiveContigCause::NotFull); - // println!(">> NotFull"); - // return (false, vec![UnproductiveContigCause::NotFull]); } if full && !too_large && !misordered { return (true, vec![]); @@ -389,7 +381,7 @@ mod tests { use refx::RefData; // TODO use a smaller ref checked in to the repo let refdata = RefData::from_fasta(&String::from( - "/mnt/opt/refdata_cellranger/vdj/vdj_GRCm38_alts_ensembl-7.1.0/fasta/regions.fa", + "/mnt/home/nima.mousavi/yard/flex/runs/20230823_wasted_data_contig/ref.fa", )); let mut log: Vec = vec![]; @@ -397,9 +389,9 @@ mod tests { // NoCdr3 let b = DnaString::from_dna_string("ACATCTCTCTCATTAGAGGTTGATCTTTGAGGAAAACAGGGTGTTGCCTAAAGGATGAAAGTGTTGAGTCTGTTGTACCTGTTGACAGCCATTCCTGGTATCCTGTCTGATGTACAGCTTCAGGAGTCAGGACCTGGCCTCGTGAAACCTTCTCAGTCTCTGTCTCTCACCTGCTCTGTCACTGGCTACTCCATCACCAGTGGTTATTACTGGAACTGGATCCGGCAGTTTCCAGGAAACAAACTGGAATGGATGGGCTACATAAGCTACGACGGTAGCAATAACTACAACCCATCTCTCAAAAATCGAATCTCCATCACTCGTGACACATCTAAGAACCAGTTTTTCCTGAAGTTGAATTCTGTGACTACTGAGGACACAGCTACATATTACTGTGCAAGATCTACTATGATTACGACGGGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAG"); let ann = [ - (54, 148, 139, 0, 16), - (205, 246, 139, 148, 58), - (418, 48, 29, 0, 2), + (54, 148, 0, 0, 16), + (205, 246, 0, 148, 58), + (418, 48, 1, 0, 2), ]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); assert!(!return_value.0); @@ -410,7 +402,7 @@ mod tests { // NotFull let b = DnaString::from_dna_string("GAACACATGCCCAATGTCCTCTCCACAGACACTGAACACACTGACTCCAACCATGGGGTGGAGTCTGGATCTTTTTCTTCCTCCTGTCAGGAACTGCAGGTGTCCACTCTGAGGTCCAGCTGCAACAGTCTGGACCTGAGCTGGTGAAGCCTGGGGCTTCAGTGAAGATATCCTGCAAGGCTTCTGGCTACACATTCACTGACTACTACATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTGGACTTGTTAATCCTAACAATGGTGGTACTAGCTACAACCAGAAGTTCAAGGGCAAGGCCACATTGACTGTAGACAAGTCCTCCAGCACAGCCTACATGGAGCTCCGCAGCCTGACATCTGAGGACTCTGCGGTCTATTACTGTGCAAGAAGGGCTAGGGTAACTGGGATGCTATGGACTACTGGGGTCAAGGAACCTCAGTCACCGTCTCCTCAGAGAGTCAGTCCTTCCCAAATGTCTTCCCCCTCGTCTCCTGCGAGAGCCCCCTGTCTGATAAGAATCTGGTGGCCATGGGCTGCCTGGCCCGGGACTTCCTGCCCAGCACCATTTCCTTCACCTGGAACTACCAGAACAACACTGAAGTCATCCAGGGTATCAGAACCTTCCCAACACTGAGGACAGGGGGCAAGTACCTAGCCACCTCGCA"); - let ann = [(64, 340, 46, 11, 11)]; + let ann = [(64, 340, 2, 11, 11)]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); assert!(!return_value.0); assert!(return_value @@ -420,9 +412,9 @@ mod tests { // [NotFull, TooLarge] let ann = [ - (64, 340, 46, 11, 11), - (416, 54, 30, 0, 4), - (470, 211, 31, 0, 0), + (64, 340, 2, 11, 11), + (416, 54, 3, 0, 4), + (470, 211, 4, 0, 0), ]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); assert!(!return_value.0); @@ -434,9 +426,9 @@ mod tests { // [Misordered, NotFull, TooLarge] let ann = [ - (416, 54, 30, 0, 4), - (64, 340, 46, 11, 11), - (470, 211, 31, 0, 0), + (416, 54, 3, 0, 4), + (64, 340, 2, 11, 11), + (470, 211, 4, 0, 0), ]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); assert!(!return_value.0); From 762c3782cfdac9d2583ebb374fb133251b9bad64 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Thu, 24 Aug 2023 09:58:15 -0700 Subject: [PATCH 12/23] added a productive case to the test --- vdj_ann/src/transcript.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 48a79d1a..9d710c29 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -439,6 +439,12 @@ mod tests { ] .contains(item))); + let b = DnaString::from_dna_string("GGACCAAAATTCAAAGACAAAATGCATTGTCAAGTGCAGATTTTCAGCTTCCTGCTAATCAGTGCCTCAGTCATAATGTCCAGAGGACAAATTGTTCTCACCCAGTCTCCAGCAATCATGTCTGCATCTCCAGGGGAGAAGGTCACCATAACCTGCAGTGCCAGCTCAAGTGTAAGTTACATGCACTGGTTCCAGCAGAAGCCAGGCACTTCTCCCAAACTCTGGATTTATAGCACATCCAACCTGGCTTCTGGAGTCCCTGCTCGCTTCAGTGGCAGTGGATCTGGGACCTCTTACTCTCTCACAATCAGCCGAATGGAGGCTGAAGATGCTGCCACTTATTACTGCCAGCAAAGGAGTAGTTACCCGCTCACGTTCGGTGCTGGGACCAAGCTGGAGCTGAAACGGGCTGATGCTGCACCAACTGTATCCATCTTCCCACCATCCAGTGAGCAGTTAACATCTGGAGGTGCCTCAGTCGTGTGCTTC"); + let ann = [(21, 352, 5, 0, 3), (368, 38, 6, 0, 0), (406, 83, 7, 0, 0)]; + let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); + assert!(return_value.0); + assert!(return_value.1.is_empty()); + assert!(1 == 2); } } From 6b1857b7093707dc7b0f64ede5b6a445966320ff Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Thu, 24 Aug 2023 10:02:22 -0700 Subject: [PATCH 13/23] added test data to repo --- vdj_ann/src/transcript.rs | 8 +++----- .../test/inputs/test_productive_is_valid_ref.fa | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 vdj_ann/test/inputs/test_productive_is_valid_ref.fa diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 9d710c29..ebdb1287 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -380,9 +380,8 @@ mod tests { use debruijn::dna_string::DnaString; use refx::RefData; // TODO use a smaller ref checked in to the repo - let refdata = RefData::from_fasta(&String::from( - "/mnt/home/nima.mousavi/yard/flex/runs/20230823_wasted_data_contig/ref.fa", - )); + let refdata = + RefData::from_fasta(&String::from("test/inputs/test_productive_is_valid_ref.fa")); let mut log: Vec = vec![]; @@ -439,12 +438,11 @@ mod tests { ] .contains(item))); + // Productive let b = DnaString::from_dna_string("GGACCAAAATTCAAAGACAAAATGCATTGTCAAGTGCAGATTTTCAGCTTCCTGCTAATCAGTGCCTCAGTCATAATGTCCAGAGGACAAATTGTTCTCACCCAGTCTCCAGCAATCATGTCTGCATCTCCAGGGGAGAAGGTCACCATAACCTGCAGTGCCAGCTCAAGTGTAAGTTACATGCACTGGTTCCAGCAGAAGCCAGGCACTTCTCCCAAACTCTGGATTTATAGCACATCCAACCTGGCTTCTGGAGTCCCTGCTCGCTTCAGTGGCAGTGGATCTGGGACCTCTTACTCTCTCACAATCAGCCGAATGGAGGCTGAAGATGCTGCCACTTATTACTGCCAGCAAAGGAGTAGTTACCCGCTCACGTTCGGTGCTGGGACCAAGCTGGAGCTGAAACGGGCTGATGCTGCACCAACTGTATCCATCTTCCCACCATCCAGTGAGCAGTTAACATCTGGAGGTGCCTCAGTCGTGTGCTTC"); let ann = [(21, 352, 5, 0, 3), (368, 38, 6, 0, 0), (406, 83, 7, 0, 0)]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); assert!(return_value.0); assert!(return_value.1.is_empty()); - - assert!(1 == 2); } } diff --git a/vdj_ann/test/inputs/test_productive_is_valid_ref.fa b/vdj_ann/test/inputs/test_productive_is_valid_ref.fa new file mode 100644 index 00000000..1b236475 --- /dev/null +++ b/vdj_ann/test/inputs/test_productive_is_valid_ref.fa @@ -0,0 +1,16 @@ +>140|IGHV3-8 ENSMUST00000103483|IGHV3-8|L-REGION+V-REGION|IG|IGH|None|00 +ATGATGGTGTTAAGTCTTCTGTACCTGTTGACAGCCCTTCCGGGTATCCTGTCAGAGGTGCAGCTTCAGGAGTCAGGACCTGGCCTGGCAAAACCTTCTCAGACTCTGTCCCTCACCTGTTCTGTCACTGGCTACTCCATCACCAGTGATTACTGGAACTGGATCCGGAAATTCCCAGGGAATAAACTTGAGTACATGGGGTACATAAGCTACAGTGGTAGCACTTACTACAATCCATCTCTCAAAAGTCGAATCTCCATAACTCGAGACACATCCAAGAACCAGTATTACCTGCAGTTGAATTCTGTGACTACTGAGGACACAGCCACATATTACTGTGCAAGATACACAGTGTGGAGTCTTCAGTGTAGGCCCAGACATAAACCTCCTTGTA +>30|IGHJ3 ENSMUST00000103428|IGHJ3|J-REGION|IG|IGH|None|00 +CCTGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAG +>47|IGHV1-26 ENSMUST00000103510|IGHV1-26|L-REGION+V-REGION|IG|IGH|None|00 +ATGGGATGGAGCTGGATCTTTCTCTTTCTCCTGTCAGGAACTGCAGGTGTCCTCTCTGAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGGCTTCAGTGAAGATATCCTGTAAGGCTTCTGGATACACGTTCACTGACTACTACATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTGGAGATATTAATCCTAACAATGGTGGTACTAGCTACAACCAGAAGTTCAAGGGCAAGGCCACATTGACTGTAGACAAGTCCTCCAGCACAGCCTACATGGAGCTCCGCAGCCTGACATCTGAGGACTCTGCAGTCTATTACTGTGCAAGA +>31|IGHJ4 ENSMUST00000103427|IGHJ4|J-REGION|IG|IGH|None|00 +ATTACTATGCTATGGACTACTGGGGTCAAGGAACCTCAGTCACCGTCTCCTCAG +>32|IGHM ENSMUST00000103426|IGHM|C-REGION|IG|IGH|M|00 +AGAGTCAGTCCTTCCCAAATGTCTTCCCCCTCGTCTCCTGCGAGAGCCCCCTGTCTGATAAGAATCTGGTGGCCATGGGCTGCCTGGCCCGGGACTTCCTGCCCAGCACCATTTCCTTCACCTGGAACTACCAGAACAACACTGAAGTCATCCAGGGTATCAGAACCTTCCCAACACTGAGGACAGGGGGCAAGTACCTAGCCACCTCGCAGGTGTTGCTGTCTCCCAAGAGCATCCTTGAAGGTTCAGATGAATACCTGGTATGCAAAATCCACTACGGAGGCAAAAACAAAGATCTGCATGTGCCCATTCCAGCTGTCGCAGAGATGAACCCCAATGTAAATGTGTTCGTCCCACCACGGGATGGCTTCTCTGGCCCTGCACCACGCAAGTCTAAACTCATCTGCGAGGCCACGAACTTCACTCCAAAACCGATCACAGTATCCTGGCTAAAGGATGGGAAGCTCGTGGAATCTGGCTTCACCACAGATCCGGTGACCATCGAGAACAAAGGATCCACACCCCAAACCTACAAGGTCATAAGCACACTTACCATCTCTGAAATCGACTGGCTGAACCTGAATGTGTACACCTGCCGTGTGGATCACAGGGGTCTCACCTTCTTGAAGAACGTGTCCTCCACATGTGCTGCCAGTCCCTCCACAGACATCCTAACCTTCACCATCCCCCCCTCCTTTGCCGACATCTTCCTCAGCAAGTCCGCTAACCTGACCTGTCTGGTCTCAAACCTGGCAACCTATGAAACCCTGAATATCTCCTGGGCTTCTCAAAGTGGTGAACCACTGGAAACCAAAATTAAAATCATGGAAAGCCATCCCAATGGCACCTTCAGTGCTAAGGGTGTGGCTAGTGTTTGTGTGGAAGACTGGAATAACAGGAAGGAATTTGTGTGTACTGTGACTCACAGGGATCTGCCTTCACCACAGAAGAAATTCATCTCAAAACCCAATGAGGTGCACAAACATCCACCTGCTGTGTACCTGCTGCCACCAGCTCGTGAGCAACTGAACCTGAGGGAGTCAGCCACAGTCACCTGCCTGGTGAAGGGCTTCTCTCCTGCAGACATCAGTGTGCAGTGGCTTCAGAGAGGGCAACTCTTGCCCCAAGAGAAGTATGTGACCAGTGCCCCGATGCCAGAGCCTGGGGCCCCAGGCTTCTACTTTACCCACAGCATCCTGACTGTGACAGAGGAGGAATGGAACTCCGGAGAGACCTATACCTGTGTTGTAGGCCACGAGGCCCTGCCACACCTGGTGACCGAGAGGACCGTGGACAAGTCCACTGGTAAACCCACACTGTACAATGTCTCCCTGATCATGTCTGACACAGGCGGCACCTGCTAT +>259|IGKV4-57 ENSMUST00000103357|IGKV4-57|L-REGION+V-REGION|IG|IGK|None|00 +ATGCATTTTCAAGTGCAGATTTTCAGCTTCCTGCTAATCAGTGCCTCAGTCATAATGTCCAGAGGACAAATTGTTCTCACCCAGTCTCCAGCAATCATGTCTGCATCTCCAGGGGAGAAGGTCACCATAACCTGCAGTGCCAGCTCAAGTGTAAGTTACATGCACTGGTTCCAGCAGAAGCCAGGCACTTCTCCCAAACTCTGGATTTATAGCACATCCAACCTGGCTTCTGGAGTCCCTGCTCGCTTCAGTGGCAGTGGATCTGGGACCTCTTACTCTCTCACAATCAGCCGAATGGAGGCTGAAGATGCTGCCACTTATTACTGCCAGCAAAGGAGTAGTTACCCACCCA +>185|IGKJ5 ENSMUST00000199459|IGKJ5|J-REGION|IG|IGK|None|00 +GCTCACGTTCGGTGCTGGGACCAAGCTGGAGCTGAAAC +>180|IGKC ENSMUST00000103410|IGKC|C-REGION|IG|IGK|None|00 +GGGCTGATGCTGCACCAACTGTATCCATCTTCCCACCATCCAGTGAGCAGTTAACATCTGGAGGTGCCTCAGTCGTGTGCTTCTTGAACAACTTCTACCCCAAAGACATCAATGTCAAGTGGAAGATTGATGGCAGTGAACGACAAAATGGCGTCCTGAACAGTTGGACTGATCAGGACAGCAAAGACAGCACCTACAGCATGAGCAGCACCCTCACGTTGACCAAGGACGAGTATGAACGACATAACAGCTATACCTGTGAGGCCACTCACAAGACATCAACTTCACCCATTGTCAAGAGCTTCAACAGGAATGAGTGT From 067eb0f142d1694054357951e469d024e0eb096d Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Thu, 24 Aug 2023 10:02:43 -0700 Subject: [PATCH 14/23] remove comment --- vdj_ann/src/transcript.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index ebdb1287..381558c9 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -379,7 +379,7 @@ mod tests { fn test_is_valid() { use debruijn::dna_string::DnaString; use refx::RefData; - // TODO use a smaller ref checked in to the repo + let refdata = RefData::from_fasta(&String::from("test/inputs/test_productive_is_valid_ref.fa")); From 11424d93079d672a1c1dc4f8f3f10f24e98d1dca Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Thu, 24 Aug 2023 10:09:41 -0700 Subject: [PATCH 15/23] use object return instead of tuple --- vdj_ann/src/annotate.rs | 18 +++++++---------- vdj_ann/src/transcript.rs | 42 ++++++++++++++++++++++++++------------- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs index 1fd22353..9fa26c7f 100644 --- a/vdj_ann/src/annotate.rs +++ b/vdj_ann/src/annotate.rs @@ -8,21 +8,17 @@ use crate::transcript::is_valid; use align_tools::affine_align; use amino::{aa_seq, have_start}; use bio_edit::alignment::AlignmentOperation::{Del, Ins, Match, Subst, Xclip, Yclip}; -use debruijn::{ - dna_string::{DnaString, DnaStringSlice}, - kmer::{Kmer12, Kmer20}, - Mer, Vmer, -}; +use debruijn::dna_string::{DnaString, DnaStringSlice}; +use debruijn::kmer::{Kmer12, Kmer20}; +use debruijn::{Mer, Vmer}; use io_utils::{fwrite, fwriteln}; use itertools::Itertools; use serde::{Deserialize, Serialize}; use stats_utils::percent_ratio; +use std::cmp::{max, min}; use std::fmt::Write as _; -use std::{ - cmp::{max, min}, - fs::File, - io::{BufWriter, Write}, -}; +use std::fs::File; +use std::io::{BufWriter, Write}; use string_utils::{stringme, strme, TextUtils}; use vdj_types::{VdjChain, VdjRegion}; use vector_utils::{ @@ -3179,7 +3175,7 @@ impl ContigAnnotation { let mut ann = Vec::<(i32, i32, i32, i32, i32)>::new(); annotate_seq(b, refdata, &mut ann, true, false, true); let mut log = Vec::::new(); - let productive = is_valid(b, refdata, &ann, false, &mut log, is_gd).0; + let productive = is_valid(b, refdata, &ann, false, &mut log, is_gd).productive; ContigAnnotation::from_annotate_seq( b, q, diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 381558c9..5dff116b 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -27,6 +27,11 @@ pub enum UnproductiveContigCause { TooLarge, } +pub struct ContigStatus { + pub productive: bool, + pub unproductive_cause: Vec, +} + pub fn is_valid( b: &DnaString, refdata: &RefData, @@ -34,7 +39,7 @@ pub fn is_valid( logme: bool, log: &mut Vec, is_gd: Option, -) -> (bool, Vec) { +) -> ContigStatus { // Unwrap gamma/delta mode flag let gd_mode = is_gd.unwrap_or(false); let refs = &refdata.refs; @@ -151,7 +156,10 @@ pub fn is_valid( if logme { fwriteln!(log, "did not find CDR3"); } - return (false, vec![UnproductiveContigCause::NoCdr3]); + return ContigStatus { + productive: false, + unproductive_cause: vec![UnproductiveContigCause::NoCdr3], + }; } let mut too_large = false; const MIN_DELTA: i32 = -25; @@ -208,12 +216,18 @@ pub fn is_valid( ret_vec.push(UnproductiveContigCause::NotFull); } if full && !too_large && !misordered { - return (true, vec![]); + return ContigStatus { + productive: true, + unproductive_cause: vec![], + }; } } ret_vec.sort_unstable(); ret_vec.dedup(); - (false, ret_vec) + return ContigStatus { + productive: false, + unproductive_cause: ret_vec, + }; } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ @@ -393,9 +407,9 @@ mod tests { (418, 48, 1, 0, 2), ]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); - assert!(!return_value.0); + assert!(!return_value.productive); assert!(return_value - .1 + .unproductive_cause .iter() .all(|item| vec![UnproductiveContigCause::NoCdr3].contains(item))); @@ -403,9 +417,9 @@ mod tests { let b = DnaString::from_dna_string("GAACACATGCCCAATGTCCTCTCCACAGACACTGAACACACTGACTCCAACCATGGGGTGGAGTCTGGATCTTTTTCTTCCTCCTGTCAGGAACTGCAGGTGTCCACTCTGAGGTCCAGCTGCAACAGTCTGGACCTGAGCTGGTGAAGCCTGGGGCTTCAGTGAAGATATCCTGCAAGGCTTCTGGCTACACATTCACTGACTACTACATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTGGACTTGTTAATCCTAACAATGGTGGTACTAGCTACAACCAGAAGTTCAAGGGCAAGGCCACATTGACTGTAGACAAGTCCTCCAGCACAGCCTACATGGAGCTCCGCAGCCTGACATCTGAGGACTCTGCGGTCTATTACTGTGCAAGAAGGGCTAGGGTAACTGGGATGCTATGGACTACTGGGGTCAAGGAACCTCAGTCACCGTCTCCTCAGAGAGTCAGTCCTTCCCAAATGTCTTCCCCCTCGTCTCCTGCGAGAGCCCCCTGTCTGATAAGAATCTGGTGGCCATGGGCTGCCTGGCCCGGGACTTCCTGCCCAGCACCATTTCCTTCACCTGGAACTACCAGAACAACACTGAAGTCATCCAGGGTATCAGAACCTTCCCAACACTGAGGACAGGGGGCAAGTACCTAGCCACCTCGCA"); let ann = [(64, 340, 2, 11, 11)]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); - assert!(!return_value.0); + assert!(!return_value.productive); assert!(return_value - .1 + .unproductive_cause .iter() .all(|item| vec![UnproductiveContigCause::NotFull].contains(item))); @@ -416,8 +430,8 @@ mod tests { (470, 211, 4, 0, 0), ]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); - assert!(!return_value.0); - assert!(return_value.1.iter().all(|item| vec![ + assert!(!return_value.productive); + assert!(return_value.unproductive_cause.iter().all(|item| vec![ UnproductiveContigCause::NotFull, UnproductiveContigCause::TooLarge ] @@ -430,8 +444,8 @@ mod tests { (470, 211, 4, 0, 0), ]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); - assert!(!return_value.0); - assert!(return_value.1.iter().all(|item| vec![ + assert!(!return_value.productive); + assert!(return_value.unproductive_cause.iter().all(|item| vec![ UnproductiveContigCause::NotFull, UnproductiveContigCause::TooLarge, UnproductiveContigCause::Misordered @@ -442,7 +456,7 @@ mod tests { let b = DnaString::from_dna_string("GGACCAAAATTCAAAGACAAAATGCATTGTCAAGTGCAGATTTTCAGCTTCCTGCTAATCAGTGCCTCAGTCATAATGTCCAGAGGACAAATTGTTCTCACCCAGTCTCCAGCAATCATGTCTGCATCTCCAGGGGAGAAGGTCACCATAACCTGCAGTGCCAGCTCAAGTGTAAGTTACATGCACTGGTTCCAGCAGAAGCCAGGCACTTCTCCCAAACTCTGGATTTATAGCACATCCAACCTGGCTTCTGGAGTCCCTGCTCGCTTCAGTGGCAGTGGATCTGGGACCTCTTACTCTCTCACAATCAGCCGAATGGAGGCTGAAGATGCTGCCACTTATTACTGCCAGCAAAGGAGTAGTTACCCGCTCACGTTCGGTGCTGGGACCAAGCTGGAGCTGAAACGGGCTGATGCTGCACCAACTGTATCCATCTTCCCACCATCCAGTGAGCAGTTAACATCTGGAGGTGCCTCAGTCGTGTGCTTC"); let ann = [(21, 352, 5, 0, 3), (368, 38, 6, 0, 0), (406, 83, 7, 0, 0)]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); - assert!(return_value.0); - assert!(return_value.1.is_empty()); + assert!(return_value.productive); + assert!(return_value.unproductive_cause.is_empty()); } } From 7ea1499637892531e594bba00e1b64381b6fb5ea Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Thu, 24 Aug 2023 14:48:55 -0700 Subject: [PATCH 16/23] don't jump out when no cdr3 --- vdj_ann/src/transcript.rs | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 5dff116b..21ed0a9f 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -27,6 +27,7 @@ pub enum UnproductiveContigCause { TooLarge, } +#[derive(Debug)] pub struct ContigStatus { pub productive: bool, pub unproductive_cause: Vec, @@ -45,6 +46,8 @@ pub fn is_valid( let refs = &refdata.refs; let rheaders = &refdata.rheaders; let mut ret_vec = Vec::new(); + let mut never_full = true; + // two passes, one for light chains and one for heavy chains for pass in 0..2 { let mut m = "A"; if pass == 1 { @@ -104,12 +107,13 @@ pub fn is_valid( unique_sort(&mut vstarts); unique_sort(&mut jstops); let mut full = false; - for pass in 1..3 { - if pass == 2 && full { + // 2 passes to check frameshifts (finding the start/stop codon) + for inner_pass in 1..3 { + if inner_pass == 2 && full { continue; } let mut msg = ""; - if pass == 2 { + if inner_pass == 2 { msg = "frameshifted "; }; for start in vstarts.iter() { @@ -118,7 +122,8 @@ pub fn is_valid( } for stop in jstops.iter() { let n = stop - start; - if pass == 2 || n % 3 == 1 { + // on second pass, go through with checking for stop codon regardless of n % 3 value + if inner_pass == 2 || n % 3 == 1 { let mut stop_codon = false; // shouldn't it be stop-3+1???????????????????????????????? for j in (*start..stop - 3).step_by(3) { @@ -127,8 +132,9 @@ pub fn is_valid( } } if !stop_codon { - if pass == 1 { + if inner_pass == 1 { full = true; + never_full = false; } if logme { fwriteln!( @@ -156,16 +162,13 @@ pub fn is_valid( if logme { fwriteln!(log, "did not find CDR3"); } - return ContigStatus { - productive: false, - unproductive_cause: vec![UnproductiveContigCause::NoCdr3], - }; + ret_vec.push(UnproductiveContigCause::NoCdr3); } let mut too_large = false; const MIN_DELTA: i32 = -25; const MIN_DELTA_IGH: i32 = -55; const MAX_DELTA: i32 = 35; - if first_vstart >= 0 && last_jstop >= 0 { + if first_vstart >= 0 && last_jstop >= 0 && !cdr3.is_empty() { let delta = (last_jstop_len + first_vstart_len + 3 * cdr3[0].1.len() as i32 - 20) - (last_jstop - first_vstart); if logme { @@ -209,19 +212,21 @@ pub fn is_valid( } ret_vec.push(UnproductiveContigCause::TooLarge); } - if !full { - if logme { - fwriteln!(log, "not full"); - } - ret_vec.push(UnproductiveContigCause::NotFull); - } - if full && !too_large && !misordered { + if full && !too_large && !misordered && ret_vec.is_empty() { return ContigStatus { productive: true, unproductive_cause: vec![], }; } } + if never_full { + if logme { + fwriteln!(log, "not full"); + } + + ret_vec.push(UnproductiveContigCause::NotFull); + } + ret_vec.sort_unstable(); ret_vec.dedup(); return ContigStatus { From 78da3a3cd860c2d20feb85a409b7b35e26754a6b Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Thu, 24 Aug 2023 14:52:41 -0700 Subject: [PATCH 17/23] remove extra return --- vdj_ann/src/transcript.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 21ed0a9f..fdfdbd05 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -229,10 +229,10 @@ pub fn is_valid( ret_vec.sort_unstable(); ret_vec.dedup(); - return ContigStatus { + ContigStatus { productive: false, unproductive_cause: ret_vec, - }; + } } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ From d7f06d4742c841ed6572e869e6abd1a7407c9562 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Thu, 24 Aug 2023 14:55:52 -0700 Subject: [PATCH 18/23] fix clippy error --- vdj_ann/src/transcript.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index fdfdbd05..b19a52ea 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -400,7 +400,7 @@ mod tests { use refx::RefData; let refdata = - RefData::from_fasta(&String::from("test/inputs/test_productive_is_valid_ref.fa")); + RefData::from_fasta(String::from("test/inputs/test_productive_is_valid_ref.fa")); let mut log: Vec = vec![]; From 914ae09516edede87da11527372e7f1f18c0c4aa Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 30 Aug 2023 13:16:36 -0700 Subject: [PATCH 19/23] parsing ann into a rust object for better readability --- vdj_ann/src/transcript.rs | 89 ++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index b19a52ea..9a1ce8f1 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -33,6 +33,28 @@ pub struct ContigStatus { pub unproductive_cause: Vec, } +// ann: { ( start on sequence, match length, ref tig, start on ref tig, mismatches on sequence ) }. + +#[derive(Debug)] +pub struct Annotation { + pub seq_start: usize, // Start position on sequence + pub match_len: usize, // Match length + pub ref_tig: usize, // Reference tig index (ordered in ref.fa starting from 0) + pub tig_start: usize, // Start position on reference tig + pub mismatches: usize, +} +impl From<(i32, i32, i32, i32, i32)> for Annotation { + fn from(ann_element: (i32, i32, i32, i32, i32)) -> Self { + Annotation { + seq_start: ann_element.0 as usize, + match_len: ann_element.1 as usize, + ref_tig: ann_element.2 as usize, + tig_start: ann_element.3 as usize, + mismatches: ann_element.4 as usize, + } + } +} + pub fn is_valid( b: &DnaString, refdata: &RefData, @@ -61,46 +83,43 @@ pub fn is_valid( let mut last_jstop_len: i32 = -1; let mut igh = false; for j in 0..ann.len() { - let l = ann[j].0 as usize; - let len = ann[j].1 as usize; - let t = ann[j].2 as usize; - let p = ann[j].3 as usize; - if rheaders[t].contains("IGH") { + let annot = Annotation::from(ann[j]); + if rheaders[annot.ref_tig].contains("IGH") { igh = true; } - if !rheaders[t].contains("5'UTR") + if !rheaders[annot.ref_tig].contains("5'UTR") && ((m == "A" - && (rheaders[t].contains("TRAV") - || (rheaders[t].contains("TRGV") && gd_mode) - || rheaders[t].contains("IGHV"))) + && (rheaders[annot.ref_tig].contains("TRAV") + || (rheaders[annot.ref_tig].contains("TRGV") && gd_mode) + || rheaders[annot.ref_tig].contains("IGHV"))) || (m == "B" - && (rheaders[t].contains("TRBV") - || (rheaders[t].contains("TRDV") && gd_mode) - || rheaders[t].contains("IGLV") - || rheaders[t].contains("IGKV")))) + && (rheaders[annot.ref_tig].contains("TRBV") + || (rheaders[annot.ref_tig].contains("TRDV") && gd_mode) + || rheaders[annot.ref_tig].contains("IGLV") + || rheaders[annot.ref_tig].contains("IGKV")))) { if first_vstart < 0 { - first_vstart = l as i32; - first_vstart_len = (refs[t].len() - p) as i32; + first_vstart = annot.seq_start as i32; + first_vstart_len = (refs[annot.ref_tig].len() - annot.tig_start) as i32; } - if p == 0 { - vstarts.push(l as i32); + if annot.tig_start == 0 { + vstarts.push(annot.seq_start as i32); } } if (m == "A" - && (rheaders[t].contains("TRAJ") - || (rheaders[t].contains("TRGJ") && gd_mode) - || rheaders[t].contains("IGHJ"))) + && (rheaders[annot.ref_tig].contains("TRAJ") + || (rheaders[annot.ref_tig].contains("TRGJ") && gd_mode) + || rheaders[annot.ref_tig].contains("IGHJ"))) || (m == "B" - && (rheaders[t].contains("TRBJ") - || (rheaders[t].contains("TRDJ") && gd_mode) - || rheaders[t].contains("IGLJ") - || rheaders[t].contains("IGKJ"))) + && (rheaders[annot.ref_tig].contains("TRBJ") + || (rheaders[annot.ref_tig].contains("TRDJ") && gd_mode) + || rheaders[annot.ref_tig].contains("IGLJ") + || rheaders[annot.ref_tig].contains("IGKJ"))) { - last_jstop = (l + len) as i32; - last_jstop_len = (p + len) as i32; - if p + len == refs[t].len() { - jstops.push((l + len) as i32); + last_jstop = (annot.seq_start + annot.match_len) as i32; + last_jstop_len = (annot.tig_start + annot.match_len) as i32; + if annot.tig_start + annot.match_len == refs[annot.ref_tig].len() { + jstops.push((annot.seq_start + annot.match_len) as i32); } } } @@ -187,14 +206,14 @@ pub fn is_valid( } let mut misordered = false; for j1 in 0..ann.len() { - let t1 = ann[j1].2 as usize; + let annot1 = Annotation::from(ann[j1]); for j2 in j1 + 1..ann.len() { - let t2 = ann[j2].2 as usize; - if (refdata.is_j(t1) && refdata.is_v(t2)) - || (refdata.is_j(t1) && refdata.is_u(t2)) - || (refdata.is_j(t1) && refdata.is_d(t2)) - || (refdata.is_v(t1) && refdata.is_u(t2)) - || (refdata.is_c(t1) && !refdata.is_c(t2)) + let annot2 = Annotation::from(ann[j2]); + if (refdata.is_j(annot1.ref_tig) && refdata.is_v(annot2.ref_tig)) + || (refdata.is_j(annot1.ref_tig) && refdata.is_u(annot2.ref_tig)) + || (refdata.is_j(annot1.ref_tig) && refdata.is_d(annot2.ref_tig)) + || (refdata.is_v(annot1.ref_tig) && refdata.is_u(annot2.ref_tig)) + || (refdata.is_c(annot1.ref_tig) && !refdata.is_c(annot2.ref_tig)) { misordered = true; } From dbd054defe90db72858460d04498d83d7c3376c4 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 30 Aug 2023 14:24:31 -0700 Subject: [PATCH 20/23] use bitflags for unproductive cause --- Cargo.lock | 1 + vdj_ann/Cargo.toml | 1 + vdj_ann/src/transcript.rs | 92 ++++++++++++++++++++++----------------- 3 files changed, 55 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f71ce29e..caafb3ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1354,6 +1354,7 @@ dependencies = [ "align_tools", "amino", "bio_edit", + "bitflags 2.4.0", "debruijn", "fasta_tools", "hyperbase", diff --git a/vdj_ann/Cargo.toml b/vdj_ann/Cargo.toml index 103f8706..6c0de819 100644 --- a/vdj_ann/Cargo.toml +++ b/vdj_ann/Cargo.toml @@ -26,3 +26,4 @@ stats_utils = { version = "0.1", path = "../stats_utils" } string_utils = { version = "0.1", path = "../string_utils" } vector_utils = { version = "0.1", path = "../vector_utils" } vdj_types = { version = "0.2", path = "../vdj_types" } +bitflags = "2.4.0" diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 9a1ce8f1..112469b5 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -5,6 +5,7 @@ use crate::annotate::get_cdr3_using_ann; use crate::refx::RefData; use amino::{have_start, have_stop}; +use bitflags::bitflags; use debruijn::dna_string::DnaString; use debruijn::kmer::Kmer20; use debruijn::{Mer, Vmer}; @@ -19,18 +20,30 @@ use vector_utils::{lower_bound1_3, unique_sort}; // TEST FOR VALID VDJ SEQUENCE // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ -#[derive(Debug, PartialEq, Ord, PartialOrd, Eq)] -pub enum UnproductiveContigCause { - NoCdr3, - Misordered, - NotFull, - TooLarge, +// #[derive(Debug, PartialEq, Ord, PartialOrd, Eq)] +// pub enum UnproductiveContigCause { +// NoCdr3, +// Misordered, +// NotFull, +// TooLarge, +// } +bitflags! { + #[derive(Debug, Default,PartialEq)] + pub struct UnproductiveContig: u16 { + const NOT_FULL_LEN = 1u16; + const MISSING_V_START = 2u16; + const PREMATURE_STOP = 4u16; + const FRAMESHIFT = 8u16; + const NO_CDR3 = 16u16; + const SIZE_DELTA = 32u16; + const MISORDERED = 64u16; + } } #[derive(Debug)] pub struct ContigStatus { pub productive: bool, - pub unproductive_cause: Vec, + pub unproductive_cause: UnproductiveContig, } // ann: { ( start on sequence, match length, ref tig, start on ref tig, mismatches on sequence ) }. @@ -67,7 +80,14 @@ pub fn is_valid( let gd_mode = is_gd.unwrap_or(false); let refs = &refdata.refs; let rheaders = &refdata.rheaders; - let mut ret_vec = Vec::new(); + // let not_full_length = false; + // let missing_vstart = false; + // let premature_stop = true; + // let frameshift_mut = false; + // let missing_cdr3 = false; + // let too_large = true; + // let misordered = false; + let mut contig_status: UnproductiveContig = Default::default(); let mut never_full = true; // two passes, one for light chains and one for heavy chains for pass in 0..2 { @@ -181,7 +201,7 @@ pub fn is_valid( if logme { fwriteln!(log, "did not find CDR3"); } - ret_vec.push(UnproductiveContigCause::NoCdr3); + contig_status |= UnproductiveContig::NO_CDR3; } let mut too_large = false; const MIN_DELTA: i32 = -25; @@ -223,18 +243,18 @@ pub fn is_valid( if logme { fwriteln!(log, "misordered"); } - ret_vec.push(UnproductiveContigCause::Misordered); + contig_status |= UnproductiveContig::MISORDERED; } if too_large { if logme { fwriteln!(log, "too large"); } - ret_vec.push(UnproductiveContigCause::TooLarge); + contig_status |= UnproductiveContig::SIZE_DELTA; } - if full && !too_large && !misordered && ret_vec.is_empty() { + if full && !too_large && !misordered && contig_status.is_empty() { return ContigStatus { productive: true, - unproductive_cause: vec![], + unproductive_cause: contig_status, }; } } @@ -243,14 +263,12 @@ pub fn is_valid( fwriteln!(log, "not full"); } - ret_vec.push(UnproductiveContigCause::NotFull); + contig_status |= UnproductiveContig::NOT_FULL_LEN; } - ret_vec.sort_unstable(); - ret_vec.dedup(); ContigStatus { productive: false, - unproductive_cause: ret_vec, + unproductive_cause: contig_status, } } @@ -423,7 +441,7 @@ mod tests { let mut log: Vec = vec![]; - // NoCdr3 + // NO_CDR3 let b = DnaString::from_dna_string("ACATCTCTCTCATTAGAGGTTGATCTTTGAGGAAAACAGGGTGTTGCCTAAAGGATGAAAGTGTTGAGTCTGTTGTACCTGTTGACAGCCATTCCTGGTATCCTGTCTGATGTACAGCTTCAGGAGTCAGGACCTGGCCTCGTGAAACCTTCTCAGTCTCTGTCTCTCACCTGCTCTGTCACTGGCTACTCCATCACCAGTGGTTATTACTGGAACTGGATCCGGCAGTTTCCAGGAAACAAACTGGAATGGATGGGCTACATAAGCTACGACGGTAGCAATAACTACAACCCATCTCTCAAAAATCGAATCTCCATCACTCGTGACACATCTAAGAACCAGTTTTTCCTGAAGTTGAATTCTGTGACTACTGAGGACACAGCTACATATTACTGTGCAAGATCTACTATGATTACGACGGGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAG"); let ann = [ (54, 148, 0, 0, 16), @@ -432,22 +450,19 @@ mod tests { ]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); assert!(!return_value.productive); - assert!(return_value - .unproductive_cause - .iter() - .all(|item| vec![UnproductiveContigCause::NoCdr3].contains(item))); + assert_eq!(return_value.unproductive_cause, UnproductiveContig::NO_CDR3); - // NotFull + // NOT_FULL_LEN let b = DnaString::from_dna_string("GAACACATGCCCAATGTCCTCTCCACAGACACTGAACACACTGACTCCAACCATGGGGTGGAGTCTGGATCTTTTTCTTCCTCCTGTCAGGAACTGCAGGTGTCCACTCTGAGGTCCAGCTGCAACAGTCTGGACCTGAGCTGGTGAAGCCTGGGGCTTCAGTGAAGATATCCTGCAAGGCTTCTGGCTACACATTCACTGACTACTACATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTGGACTTGTTAATCCTAACAATGGTGGTACTAGCTACAACCAGAAGTTCAAGGGCAAGGCCACATTGACTGTAGACAAGTCCTCCAGCACAGCCTACATGGAGCTCCGCAGCCTGACATCTGAGGACTCTGCGGTCTATTACTGTGCAAGAAGGGCTAGGGTAACTGGGATGCTATGGACTACTGGGGTCAAGGAACCTCAGTCACCGTCTCCTCAGAGAGTCAGTCCTTCCCAAATGTCTTCCCCCTCGTCTCCTGCGAGAGCCCCCTGTCTGATAAGAATCTGGTGGCCATGGGCTGCCTGGCCCGGGACTTCCTGCCCAGCACCATTTCCTTCACCTGGAACTACCAGAACAACACTGAAGTCATCCAGGGTATCAGAACCTTCCCAACACTGAGGACAGGGGGCAAGTACCTAGCCACCTCGCA"); let ann = [(64, 340, 2, 11, 11)]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); assert!(!return_value.productive); - assert!(return_value - .unproductive_cause - .iter() - .all(|item| vec![UnproductiveContigCause::NotFull].contains(item))); + assert_eq!( + return_value.unproductive_cause, + UnproductiveContig::NOT_FULL_LEN + ); - // [NotFull, TooLarge] + // [NOT_FULL_LEN, SIZE_DELTA] let ann = [ (64, 340, 2, 11, 11), (416, 54, 3, 0, 4), @@ -455,11 +470,10 @@ mod tests { ]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); assert!(!return_value.productive); - assert!(return_value.unproductive_cause.iter().all(|item| vec![ - UnproductiveContigCause::NotFull, - UnproductiveContigCause::TooLarge - ] - .contains(item))); + assert_eq!( + return_value.unproductive_cause, + UnproductiveContig::NOT_FULL_LEN | UnproductiveContig::SIZE_DELTA + ); // [Misordered, NotFull, TooLarge] let ann = [ @@ -469,12 +483,12 @@ mod tests { ]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); assert!(!return_value.productive); - assert!(return_value.unproductive_cause.iter().all(|item| vec![ - UnproductiveContigCause::NotFull, - UnproductiveContigCause::TooLarge, - UnproductiveContigCause::Misordered - ] - .contains(item))); + assert_eq!( + return_value.unproductive_cause, + UnproductiveContig::NOT_FULL_LEN + | UnproductiveContig::SIZE_DELTA + | UnproductiveContig::MISORDERED + ); // Productive let b = DnaString::from_dna_string("GGACCAAAATTCAAAGACAAAATGCATTGTCAAGTGCAGATTTTCAGCTTCCTGCTAATCAGTGCCTCAGTCATAATGTCCAGAGGACAAATTGTTCTCACCCAGTCTCCAGCAATCATGTCTGCATCTCCAGGGGAGAAGGTCACCATAACCTGCAGTGCCAGCTCAAGTGTAAGTTACATGCACTGGTTCCAGCAGAAGCCAGGCACTTCTCCCAAACTCTGGATTTATAGCACATCCAACCTGGCTTCTGGAGTCCCTGCTCGCTTCAGTGGCAGTGGATCTGGGACCTCTTACTCTCTCACAATCAGCCGAATGGAGGCTGAAGATGCTGCCACTTATTACTGCCAGCAAAGGAGTAGTTACCCGCTCACGTTCGGTGCTGGGACCAAGCTGGAGCTGAAACGGGCTGATGCTGCACCAACTGTATCCATCTTCCCACCATCCAGTGAGCAGTTAACATCTGGAGGTGCCTCAGTCGTGTGCTTC"); From 6b9c957ba6f58936b27de790d50728abf93fdad1 Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 30 Aug 2023 14:43:13 -0700 Subject: [PATCH 21/23] added new error modes --- vdj_ann/src/transcript.rs | 121 +++++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 53 deletions(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 112469b5..50ced22f 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -81,7 +81,7 @@ pub fn is_valid( let refs = &refdata.refs; let rheaders = &refdata.rheaders; // let not_full_length = false; - // let missing_vstart = false; + let mut missing_vstart = true; // let premature_stop = true; // let frameshift_mut = false; // let missing_cdr3 = false; @@ -95,6 +95,7 @@ pub fn is_valid( if pass == 1 { m = "B"; } + println!(">> Pass: {:?}, m: {:?}", pass, m); let mut vstarts = Vec::::new(); let mut jstops = Vec::::new(); let mut first_vstart: i32 = -1; @@ -145,9 +146,12 @@ pub fn is_valid( } unique_sort(&mut vstarts); unique_sort(&mut jstops); + println!(">>>> vstarts: {:?}", vstarts); + println!(">>>> jstops: {:?}", jstops); let mut full = false; // 2 passes to check frameshifts (finding the start/stop codon) for inner_pass in 1..3 { + println!(">>>>>> inner_pass: {:?}", inner_pass); if inner_pass == 2 && full { continue; } @@ -159,6 +163,7 @@ pub fn is_valid( if !have_start(b, *start as usize) { continue; } + missing_vstart = false; for stop in jstops.iter() { let n = stop - start; // on second pass, go through with checking for stop codon regardless of n % 3 value @@ -174,6 +179,8 @@ pub fn is_valid( if inner_pass == 1 { full = true; never_full = false; + } else { + contig_status |= UnproductiveContig::FRAMESHIFT; } if logme { fwriteln!( @@ -183,13 +190,16 @@ pub fn is_valid( b.len() ); } - } else if logme { - fwriteln!( - log, - "{}full length stopped transcript of length {}", - msg, - b.len() - ); + } else { + contig_status |= UnproductiveContig::PREMATURE_STOP; + if logme { + fwriteln!( + log, + "{}full length stopped transcript of length {}", + msg, + b.len() + ); + } } } } @@ -258,6 +268,9 @@ pub fn is_valid( }; } } + if missing_vstart { + contig_status |= UnproductiveContig::MISSING_V_START; + } if never_full { if logme { fwriteln!(log, "not full"); @@ -441,60 +454,62 @@ mod tests { let mut log: Vec = vec![]; - // NO_CDR3 - let b = DnaString::from_dna_string("ACATCTCTCTCATTAGAGGTTGATCTTTGAGGAAAACAGGGTGTTGCCTAAAGGATGAAAGTGTTGAGTCTGTTGTACCTGTTGACAGCCATTCCTGGTATCCTGTCTGATGTACAGCTTCAGGAGTCAGGACCTGGCCTCGTGAAACCTTCTCAGTCTCTGTCTCTCACCTGCTCTGTCACTGGCTACTCCATCACCAGTGGTTATTACTGGAACTGGATCCGGCAGTTTCCAGGAAACAAACTGGAATGGATGGGCTACATAAGCTACGACGGTAGCAATAACTACAACCCATCTCTCAAAAATCGAATCTCCATCACTCGTGACACATCTAAGAACCAGTTTTTCCTGAAGTTGAATTCTGTGACTACTGAGGACACAGCTACATATTACTGTGCAAGATCTACTATGATTACGACGGGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAG"); - let ann = [ - (54, 148, 0, 0, 16), - (205, 246, 0, 148, 58), - (418, 48, 1, 0, 2), - ]; - let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); - assert!(!return_value.productive); - assert_eq!(return_value.unproductive_cause, UnproductiveContig::NO_CDR3); + // // NO_CDR3 + // let b = DnaString::from_dna_string("ACATCTCTCTCATTAGAGGTTGATCTTTGAGGAAAACAGGGTGTTGCCTAAAGGATGAAAGTGTTGAGTCTGTTGTACCTGTTGACAGCCATTCCTGGTATCCTGTCTGATGTACAGCTTCAGGAGTCAGGACCTGGCCTCGTGAAACCTTCTCAGTCTCTGTCTCTCACCTGCTCTGTCACTGGCTACTCCATCACCAGTGGTTATTACTGGAACTGGATCCGGCAGTTTCCAGGAAACAAACTGGAATGGATGGGCTACATAAGCTACGACGGTAGCAATAACTACAACCCATCTCTCAAAAATCGAATCTCCATCACTCGTGACACATCTAAGAACCAGTTTTTCCTGAAGTTGAATTCTGTGACTACTGAGGACACAGCTACATATTACTGTGCAAGATCTACTATGATTACGACGGGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAG"); + // let ann = [ + // (54, 148, 0, 0, 16), + // (205, 246, 0, 148, 58), + // (418, 48, 1, 0, 2), + // ]; + // let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); + // assert!(!return_value.productive); + // assert_eq!(return_value.unproductive_cause, UnproductiveContig::NO_CDR3); - // NOT_FULL_LEN - let b = DnaString::from_dna_string("GAACACATGCCCAATGTCCTCTCCACAGACACTGAACACACTGACTCCAACCATGGGGTGGAGTCTGGATCTTTTTCTTCCTCCTGTCAGGAACTGCAGGTGTCCACTCTGAGGTCCAGCTGCAACAGTCTGGACCTGAGCTGGTGAAGCCTGGGGCTTCAGTGAAGATATCCTGCAAGGCTTCTGGCTACACATTCACTGACTACTACATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTGGACTTGTTAATCCTAACAATGGTGGTACTAGCTACAACCAGAAGTTCAAGGGCAAGGCCACATTGACTGTAGACAAGTCCTCCAGCACAGCCTACATGGAGCTCCGCAGCCTGACATCTGAGGACTCTGCGGTCTATTACTGTGCAAGAAGGGCTAGGGTAACTGGGATGCTATGGACTACTGGGGTCAAGGAACCTCAGTCACCGTCTCCTCAGAGAGTCAGTCCTTCCCAAATGTCTTCCCCCTCGTCTCCTGCGAGAGCCCCCTGTCTGATAAGAATCTGGTGGCCATGGGCTGCCTGGCCCGGGACTTCCTGCCCAGCACCATTTCCTTCACCTGGAACTACCAGAACAACACTGAAGTCATCCAGGGTATCAGAACCTTCCCAACACTGAGGACAGGGGGCAAGTACCTAGCCACCTCGCA"); - let ann = [(64, 340, 2, 11, 11)]; - let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); - assert!(!return_value.productive); - assert_eq!( - return_value.unproductive_cause, - UnproductiveContig::NOT_FULL_LEN - ); + // // NOT_FULL_LEN + // let b = DnaString::from_dna_string("GAACACATGCCCAATGTCCTCTCCACAGACACTGAACACACTGACTCCAACCATGGGGTGGAGTCTGGATCTTTTTCTTCCTCCTGTCAGGAACTGCAGGTGTCCACTCTGAGGTCCAGCTGCAACAGTCTGGACCTGAGCTGGTGAAGCCTGGGGCTTCAGTGAAGATATCCTGCAAGGCTTCTGGCTACACATTCACTGACTACTACATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTGGACTTGTTAATCCTAACAATGGTGGTACTAGCTACAACCAGAAGTTCAAGGGCAAGGCCACATTGACTGTAGACAAGTCCTCCAGCACAGCCTACATGGAGCTCCGCAGCCTGACATCTGAGGACTCTGCGGTCTATTACTGTGCAAGAAGGGCTAGGGTAACTGGGATGCTATGGACTACTGGGGTCAAGGAACCTCAGTCACCGTCTCCTCAGAGAGTCAGTCCTTCCCAAATGTCTTCCCCCTCGTCTCCTGCGAGAGCCCCCTGTCTGATAAGAATCTGGTGGCCATGGGCTGCCTGGCCCGGGACTTCCTGCCCAGCACCATTTCCTTCACCTGGAACTACCAGAACAACACTGAAGTCATCCAGGGTATCAGAACCTTCCCAACACTGAGGACAGGGGGCAAGTACCTAGCCACCTCGCA"); + // let ann = [(64, 340, 2, 11, 11)]; + // let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); + // assert!(!return_value.productive); + // assert_eq!( + // return_value.unproductive_cause, + // UnproductiveContig::NOT_FULL_LEN + // ); - // [NOT_FULL_LEN, SIZE_DELTA] - let ann = [ - (64, 340, 2, 11, 11), - (416, 54, 3, 0, 4), - (470, 211, 4, 0, 0), - ]; - let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); - assert!(!return_value.productive); - assert_eq!( - return_value.unproductive_cause, - UnproductiveContig::NOT_FULL_LEN | UnproductiveContig::SIZE_DELTA - ); + // // [NOT_FULL_LEN, SIZE_DELTA] + // let ann = [ + // (64, 340, 2, 11, 11), + // (416, 54, 3, 0, 4), + // (470, 211, 4, 0, 0), + // ]; + // let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); + // assert!(!return_value.productive); + // assert_eq!( + // return_value.unproductive_cause, + // UnproductiveContig::NOT_FULL_LEN | UnproductiveContig::SIZE_DELTA + // ); - // [Misordered, NotFull, TooLarge] - let ann = [ - (416, 54, 3, 0, 4), - (64, 340, 2, 11, 11), - (470, 211, 4, 0, 0), - ]; - let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); - assert!(!return_value.productive); - assert_eq!( - return_value.unproductive_cause, - UnproductiveContig::NOT_FULL_LEN - | UnproductiveContig::SIZE_DELTA - | UnproductiveContig::MISORDERED - ); + // // [Misordered, NotFull, TooLarge] + // let ann = [ + // (416, 54, 3, 0, 4), + // (64, 340, 2, 11, 11), + // (470, 211, 4, 0, 0), + // ]; + // let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); + // assert!(!return_value.productive); + // assert_eq!( + // return_value.unproductive_cause, + // UnproductiveContig::NOT_FULL_LEN + // | UnproductiveContig::SIZE_DELTA + // | UnproductiveContig::MISORDERED + // ); // Productive let b = DnaString::from_dna_string("GGACCAAAATTCAAAGACAAAATGCATTGTCAAGTGCAGATTTTCAGCTTCCTGCTAATCAGTGCCTCAGTCATAATGTCCAGAGGACAAATTGTTCTCACCCAGTCTCCAGCAATCATGTCTGCATCTCCAGGGGAGAAGGTCACCATAACCTGCAGTGCCAGCTCAAGTGTAAGTTACATGCACTGGTTCCAGCAGAAGCCAGGCACTTCTCCCAAACTCTGGATTTATAGCACATCCAACCTGGCTTCTGGAGTCCCTGCTCGCTTCAGTGGCAGTGGATCTGGGACCTCTTACTCTCTCACAATCAGCCGAATGGAGGCTGAAGATGCTGCCACTTATTACTGCCAGCAAAGGAGTAGTTACCCGCTCACGTTCGGTGCTGGGACCAAGCTGGAGCTGAAACGGGCTGATGCTGCACCAACTGTATCCATCTTCCCACCATCCAGTGAGCAGTTAACATCTGGAGGTGCCTCAGTCGTGTGCTTC"); let ann = [(21, 352, 5, 0, 3), (368, 38, 6, 0, 0), (406, 83, 7, 0, 0)]; let return_value = is_valid(&b, &refdata, &ann, false, &mut log, None); + println!("\n\n{:?}\n", return_value); assert!(return_value.productive); assert!(return_value.unproductive_cause.is_empty()); + assert!(1 == 2); } } From d35f98cdcead149bd316576e54a395b712fdd44c Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 30 Aug 2023 15:50:21 -0700 Subject: [PATCH 22/23] remove >>s --- vdj_ann/src/transcript.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 50ced22f..62077ac3 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -95,7 +95,7 @@ pub fn is_valid( if pass == 1 { m = "B"; } - println!(">> Pass: {:?}, m: {:?}", pass, m); + // println!(">> Pass: {:?}, m: {:?}", pass, m); let mut vstarts = Vec::::new(); let mut jstops = Vec::::new(); let mut first_vstart: i32 = -1; @@ -146,12 +146,12 @@ pub fn is_valid( } unique_sort(&mut vstarts); unique_sort(&mut jstops); - println!(">>>> vstarts: {:?}", vstarts); - println!(">>>> jstops: {:?}", jstops); + // println!(">>>> vstarts: {:?}", vstarts); + // println!(">>>> jstops: {:?}", jstops); let mut full = false; // 2 passes to check frameshifts (finding the start/stop codon) for inner_pass in 1..3 { - println!(">>>>>> inner_pass: {:?}", inner_pass); + // println!(">>>>>> inner_pass: {:?}", inner_pass); if inner_pass == 2 && full { continue; } From 16a96cf414d46815d43e96da561b2fa41631985b Mon Sep 17 00:00:00 2001 From: Nima Mousavi Date: Wed, 30 Aug 2023 16:22:14 -0700 Subject: [PATCH 23/23] loosen't productivity --- vdj_ann/src/transcript.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 62077ac3..93513321 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -84,7 +84,7 @@ pub fn is_valid( let mut missing_vstart = true; // let premature_stop = true; // let frameshift_mut = false; - // let missing_cdr3 = false; + let mut missing_cdr3 = false; // let too_large = true; // let misordered = false; let mut contig_status: UnproductiveContig = Default::default(); @@ -212,6 +212,7 @@ pub fn is_valid( fwriteln!(log, "did not find CDR3"); } contig_status |= UnproductiveContig::NO_CDR3; + missing_cdr3 = true; } let mut too_large = false; const MIN_DELTA: i32 = -25; @@ -261,7 +262,8 @@ pub fn is_valid( } contig_status |= UnproductiveContig::SIZE_DELTA; } - if full && !too_large && !misordered && contig_status.is_empty() { + if full && !too_large && !misordered && !missing_cdr3 { + // && contig_status.is_empty() { return ContigStatus { productive: true, unproductive_cause: contig_status,