Skip to content

Commit

Permalink
enable dayhoff, hp sketching
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Jun 13, 2024
1 parent c25ade1 commit 8bb6525
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 10 deletions.
1 change: 1 addition & 0 deletions src/directsketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,7 @@ pub async fn gbsketch(
}
};
let dna_sig_templates = build_siginfo(&params_vec, "DNA");
// prot will build protein, dayhoff, hp
let prot_sig_templates = build_siginfo(&params_vec, "protein");

let mut genomes_only = genomes_only;
Expand Down
32 changes: 23 additions & 9 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ impl InputMolType {}
impl fmt::Display for InputMolType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
InputMolType::Dna => write!(f, "dna"),
InputMolType::Dna => write!(f, "DNA"),
InputMolType::Protein => write!(f, "protein"),
}
}
Expand Down Expand Up @@ -280,6 +280,8 @@ pub struct Params {
pub scaled: u64,
pub seed: u32,
pub is_protein: bool,
pub is_dayhoff: bool,
pub is_hp: bool,
pub is_dna: bool,
}

Expand All @@ -291,6 +293,8 @@ impl Hash for Params {
self.scaled.hash(state);
self.seed.hash(state);
self.is_protein.hash(state);
self.is_dayhoff.hash(state);
self.is_hp.hash(state);
self.is_dna.hash(state);
}
}
Expand All @@ -308,7 +312,9 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
let mut scaled = 1000;
let mut seed = 42;
let mut is_protein = false;
let mut is_dna = true;
let mut is_dayhoff = false;
let mut is_hp = false;
let mut is_dna = false;

for item in items.iter() {
match *item {
Expand Down Expand Up @@ -337,12 +343,16 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
}
"protein" => {
is_protein = true;
is_dna = false;
}
"dna" => {
is_protein = false;
is_dna = true;
}
"dayhoff" => {
is_dayhoff = true;
}
"hp" => {
is_hp = true;
}
_ => return Err(format!("unknown component '{}' in params string", item)),
}
}
Expand All @@ -356,6 +366,8 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
seed,
is_protein,
is_dna,
is_dayhoff,
is_hp,
};
unique_params.insert(param);
}
Expand All @@ -364,19 +376,19 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
Ok(unique_params.into_iter().collect())
}

pub fn build_siginfo(params: &[Params], moltype: &str) -> Vec<Signature> {
pub fn build_siginfo(params: &[Params], input_moltype: &str) -> Vec<Signature> {
let mut sigs = Vec::new();

for param in params.iter().cloned() {
match moltype {
// if dna, only build dna sigs. if protein, only build protein sigs
match input_moltype {
// if dna, only build dna sigs. if protein, only build protein sigs, etc
"dna" | "DNA" if !param.is_dna => continue,
"protein" if !param.is_protein => continue,
"protein" if !param.is_protein && !param.is_dayhoff && !param.is_hp => continue,
_ => (),
}

// Adjust ksize value based on the is_protein flag
let adjusted_ksize = if param.is_protein {
let adjusted_ksize = if param.is_protein || param.is_dayhoff || param.is_hp {
param.ksize * 3
} else {
param.ksize
Expand All @@ -387,6 +399,8 @@ pub fn build_siginfo(params: &[Params], moltype: &str) -> Vec<Signature> {
.scaled(param.scaled)
.protein(param.is_protein)
.dna(param.is_dna)
.dayhoff(param.is_dayhoff)
.hp(param.is_hp)
.num_hashes(param.num)
.track_abundance(param.track_abundance)
.build();
Expand Down
Binary file added tests/test-data/GCA_000961135.2.dayhoff.sig.gz
Binary file not shown.
Binary file added tests/test-data/GCA_000961135.2.hp.sig.gz
Binary file not shown.
47 changes: 47 additions & 0 deletions tests/test_gbsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,3 +460,50 @@ def test_gbsketch_missing_output(runtmp):
'--param-str', "dna,k=31,scaled=1000")

assert "Error: output signature zipfile is required if not using '--download-only'." in runtmp.last_result.err


def test_gbsketch_protein_dayhoff_hp(runtmp):
acc_csv = get_test_data('acc.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('failed.csv')

sig1 = get_test_data('GCA_000961135.2.protein.sig.gz')
sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz')
sig3 = get_test_data('GCA_000961135.2.hp.sig.gz')
ss1 = sourmash.load_one_signature(sig1, ksize=30, select_moltype='protein')
ss2 = sourmash.load_one_signature(sig2, ksize=30, select_moltype='dayhoff')
ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='hp')

runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output,
'--failed', failed, '-r', '1',
'--param-str',"protein,k=10,scaled=200",
'-p', "dayhoff,k=10,scaled=200",
'-p', "hp,k=10,scaled=200")

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty

idx = sourmash.load_file_as_index(output)
sigs = list(idx.signatures())

assert len(sigs) == 3
for sig in sigs:
assert sig.name == ss1.name
if sig.minhash.moltype == 'protein':
assert sig.md5sum() == ss1.md5sum()
elif sig.minhash.moltype == 'dayhoff':
assert sig.md5sum() == ss2.md5sum()
elif sig.minhash.moltype == 'hp':
assert sig.md5sum() == ss3.md5sum()
assert os.path.exists(failed)
with open(failed, 'r') as failF:
fail_lines = failF.readlines()
print(fail_lines)
assert len(fail_lines) == 2
assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n"
acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',')
assert acc == "GCA_000175535.1"
assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14"
assert moltype == "protein"
assert download_filename == "GCA_000175535.1_protein.faa.gz"
assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"
40 changes: 39 additions & 1 deletion tests/test_urlsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,4 +318,42 @@ def test_urlsketch_from_gbsketch_failed(runtmp, capfd):
assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"


# def test_urlsketch_from_urlsketch_failed(runtmp, capfd):
def test_gbsketch_protein_dayhoff_hp(runtmp):
acc_csv = get_test_data('acc-url.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('failed.csv')

sig1 = get_test_data('GCA_000961135.2.protein.sig.gz')
sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz')
sig3 = get_test_data('GCA_000961135.2.hp.sig.gz')
ss1 = sourmash.load_one_signature(sig1, ksize=30, select_moltype='protein')
ss2 = sourmash.load_one_signature(sig2, ksize=30, select_moltype='dayhoff')
ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='hp')

runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output,
'--failed', failed, '-r', '1',
'--param-str',"protein,k=10,scaled=200",
'-p', "dayhoff,k=10,scaled=200",
'-p', "hp,k=10,scaled=200")

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty

idx = sourmash.load_file_as_index(output)
sigs = list(idx.signatures())

assert len(sigs) == 3
for sig in sigs:
assert sig.name == ss1.name
if sig.minhash.moltype == 'protein':
assert sig.md5sum() == ss1.md5sum()
elif sig.minhash.moltype == 'dayhoff':
assert sig.md5sum() == ss2.md5sum()
elif sig.minhash.moltype == 'hp':
assert sig.md5sum() == ss3.md5sum()
assert os.path.exists(failed)
with open(failed, 'r') as failF:
fail_lines = failF.readlines()
print(fail_lines)
assert len(fail_lines) == 1
assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n"

0 comments on commit 8bb6525

Please sign in to comment.