diff --git a/src/directsketch.rs b/src/directsketch.rs index 34915a0..f3f50a5 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -734,6 +734,7 @@ pub async fn gbsketch( } }; let dna_sig_templates = build_siginfo(¶ms_vec, "DNA"); + // prot will build protein, dayhoff, hp let prot_sig_templates = build_siginfo(¶ms_vec, "protein"); let mut genomes_only = genomes_only; diff --git a/src/utils.rs b/src/utils.rs index ab91ac7..5b5df70 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -17,7 +17,7 @@ impl InputMolType {} impl fmt::Display for InputMolType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - InputMolType::Dna => write!(f, "dna"), + InputMolType::Dna => write!(f, "DNA"), InputMolType::Protein => write!(f, "protein"), } } @@ -280,6 +280,8 @@ pub struct Params { pub scaled: u64, pub seed: u32, pub is_protein: bool, + pub is_dayhoff: bool, + pub is_hp: bool, pub is_dna: bool, } @@ -291,6 +293,8 @@ impl Hash for Params { self.scaled.hash(state); self.seed.hash(state); self.is_protein.hash(state); + self.is_dayhoff.hash(state); + self.is_hp.hash(state); self.is_dna.hash(state); } } @@ -308,7 +312,9 @@ pub fn parse_params_str(params_strs: String) -> Result, String> { let mut scaled = 1000; let mut seed = 42; let mut is_protein = false; - let mut is_dna = true; + let mut is_dayhoff = false; + let mut is_hp = false; + let mut is_dna = false; for item in items.iter() { match *item { @@ -337,12 +343,16 @@ pub fn parse_params_str(params_strs: String) -> Result, String> { } "protein" => { is_protein = true; - is_dna = false; } "dna" => { - is_protein = false; is_dna = true; } + "dayhoff" => { + is_dayhoff = true; + } + "hp" => { + is_hp = true; + } _ => return Err(format!("unknown component '{}' in params string", item)), } } @@ -356,6 +366,8 @@ pub fn parse_params_str(params_strs: String) -> Result, String> { seed, is_protein, is_dna, + is_dayhoff, + is_hp, }; unique_params.insert(param); } @@ -364,19 +376,19 @@ pub fn parse_params_str(params_strs: String) -> Result, String> { Ok(unique_params.into_iter().collect()) } -pub fn build_siginfo(params: &[Params], moltype: &str) -> Vec { +pub fn build_siginfo(params: &[Params], input_moltype: &str) -> Vec { let mut sigs = Vec::new(); for param in params.iter().cloned() { - match moltype { - // if dna, only build dna sigs. if protein, only build protein sigs + match input_moltype { + // if dna, only build dna sigs. if protein, only build protein sigs, etc "dna" | "DNA" if !param.is_dna => continue, - "protein" if !param.is_protein => continue, + "protein" if !param.is_protein && !param.is_dayhoff && !param.is_hp => continue, _ => (), } // Adjust ksize value based on the is_protein flag - let adjusted_ksize = if param.is_protein { + let adjusted_ksize = if param.is_protein || param.is_dayhoff || param.is_hp { param.ksize * 3 } else { param.ksize @@ -387,6 +399,8 @@ pub fn build_siginfo(params: &[Params], moltype: &str) -> Vec { .scaled(param.scaled) .protein(param.is_protein) .dna(param.is_dna) + .dayhoff(param.is_dayhoff) + .hp(param.is_hp) .num_hashes(param.num) .track_abundance(param.track_abundance) .build(); diff --git a/tests/test-data/GCA_000961135.2.dayhoff.sig.gz b/tests/test-data/GCA_000961135.2.dayhoff.sig.gz new file mode 100644 index 0000000..5638011 Binary files /dev/null and b/tests/test-data/GCA_000961135.2.dayhoff.sig.gz differ diff --git a/tests/test-data/GCA_000961135.2.hp.sig.gz b/tests/test-data/GCA_000961135.2.hp.sig.gz new file mode 100644 index 0000000..40cbabc Binary files /dev/null and b/tests/test-data/GCA_000961135.2.hp.sig.gz differ diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index 8c1fd92..9ea7578 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -494,3 +494,51 @@ def test_zip_file_permissions(runtmp): print(f"File: {zip_info.filename}, Permissions: {permissions}") # check permissions are 644 (rw-r--r--) assert external_attr == 0o644 + + +def test_gbsketch_protein_dayhoff_hp(runtmp): + acc_csv = get_test_data('acc.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000961135.2.protein.sig.gz') + sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz') + sig3 = get_test_data('GCA_000961135.2.hp.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=30, select_moltype='protein') + ss2 = sourmash.load_one_signature(sig2, ksize=30, select_moltype='dayhoff') + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='hp') + + runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str',"protein,k=10,scaled=200", + '-p', "dayhoff,k=10,scaled=200", + '-p', "hp,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + assert sig.name == ss1.name + if sig.minhash.moltype == 'protein': + assert sig.md5sum() == ss1.md5sum() + elif sig.minhash.moltype == 'dayhoff': + assert sig.md5sum() == ss2.md5sum() + elif sig.minhash.moltype == 'hp': + assert sig.md5sum() == ss3.md5sum() + assert os.path.exists(failed) + with open(failed, 'r') as failF: + fail_lines = failF.readlines() + print(fail_lines) + assert len(fail_lines) == 2 + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" + acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "protein" + assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index 6967849..edfe92d 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -351,3 +351,44 @@ def test_zip_file_permissions(runtmp): # check permissions are 644 (rw-r--r--) assert external_attr == 0o644 + +def test_gbsketch_protein_dayhoff_hp(runtmp): + acc_csv = get_test_data('acc-url.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000961135.2.protein.sig.gz') + sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz') + sig3 = get_test_data('GCA_000961135.2.hp.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=30, select_moltype='protein') + ss2 = sourmash.load_one_signature(sig2, ksize=30, select_moltype='dayhoff') + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='hp') + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str',"protein,k=10,scaled=200", + '-p', "dayhoff,k=10,scaled=200", + '-p', "hp,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + assert sig.name == ss1.name + if sig.minhash.moltype == 'protein': + assert sig.md5sum() == ss1.md5sum() + elif sig.minhash.moltype == 'dayhoff': + assert sig.md5sum() == ss2.md5sum() + elif sig.minhash.moltype == 'hp': + assert sig.md5sum() == ss3.md5sum() + assert os.path.exists(failed) + with open(failed, 'r') as failF: + fail_lines = failF.readlines() + print(fail_lines) + assert len(fail_lines) == 1 + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" +