Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: enable dayhoff, hp sketching #55

Merged
merged 7 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/directsketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,7 @@ pub async fn gbsketch(
}
};
let dna_sig_templates = build_siginfo(&params_vec, "DNA");
// prot will build protein, dayhoff, hp
let prot_sig_templates = build_siginfo(&params_vec, "protein");

let mut genomes_only = genomes_only;
Expand Down
32 changes: 23 additions & 9 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ impl InputMolType {}
impl fmt::Display for InputMolType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
InputMolType::Dna => write!(f, "dna"),
InputMolType::Dna => write!(f, "DNA"),
InputMolType::Protein => write!(f, "protein"),
}
}
Expand Down Expand Up @@ -280,6 +280,8 @@ pub struct Params {
pub scaled: u64,
pub seed: u32,
pub is_protein: bool,
pub is_dayhoff: bool,
pub is_hp: bool,
pub is_dna: bool,
}

Expand All @@ -291,6 +293,8 @@ impl Hash for Params {
self.scaled.hash(state);
self.seed.hash(state);
self.is_protein.hash(state);
self.is_dayhoff.hash(state);
self.is_hp.hash(state);
self.is_dna.hash(state);
}
}
Expand All @@ -308,7 +312,9 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
let mut scaled = 1000;
let mut seed = 42;
let mut is_protein = false;
let mut is_dna = true;
let mut is_dayhoff = false;
let mut is_hp = false;
let mut is_dna = false;

for item in items.iter() {
match *item {
Expand Down Expand Up @@ -337,12 +343,16 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
}
"protein" => {
is_protein = true;
is_dna = false;
}
"dna" => {
is_protein = false;
is_dna = true;
}
"dayhoff" => {
is_dayhoff = true;
}
"hp" => {
is_hp = true;
}
_ => return Err(format!("unknown component '{}' in params string", item)),
}
}
Expand All @@ -356,6 +366,8 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
seed,
is_protein,
is_dna,
is_dayhoff,
is_hp,
};
unique_params.insert(param);
}
Expand All @@ -364,19 +376,19 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
Ok(unique_params.into_iter().collect())
}

pub fn build_siginfo(params: &[Params], moltype: &str) -> Vec<Signature> {
pub fn build_siginfo(params: &[Params], input_moltype: &str) -> Vec<Signature> {
let mut sigs = Vec::new();

for param in params.iter().cloned() {
match moltype {
// if dna, only build dna sigs. if protein, only build protein sigs
match input_moltype {
// if dna, only build dna sigs. if protein, only build protein sigs, etc
"dna" | "DNA" if !param.is_dna => continue,
"protein" if !param.is_protein => continue,
"protein" if !param.is_protein && !param.is_dayhoff && !param.is_hp => continue,
_ => (),
}

// Adjust ksize value based on the is_protein flag
let adjusted_ksize = if param.is_protein {
let adjusted_ksize = if param.is_protein || param.is_dayhoff || param.is_hp {
param.ksize * 3
} else {
param.ksize
Expand All @@ -387,6 +399,8 @@ pub fn build_siginfo(params: &[Params], moltype: &str) -> Vec<Signature> {
.scaled(param.scaled)
.protein(param.is_protein)
.dna(param.is_dna)
.dayhoff(param.is_dayhoff)
.hp(param.is_hp)
.num_hashes(param.num)
.track_abundance(param.track_abundance)
.build();
Expand Down
Binary file added tests/test-data/GCA_000961135.2.dayhoff.sig.gz
Binary file not shown.
Binary file added tests/test-data/GCA_000961135.2.hp.sig.gz
Binary file not shown.
48 changes: 48 additions & 0 deletions tests/test_gbsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,3 +494,51 @@ def test_zip_file_permissions(runtmp):
print(f"File: {zip_info.filename}, Permissions: {permissions}")
# check permissions are 644 (rw-r--r--)
assert external_attr == 0o644


def test_gbsketch_protein_dayhoff_hp(runtmp):
acc_csv = get_test_data('acc.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('failed.csv')

sig1 = get_test_data('GCA_000961135.2.protein.sig.gz')
sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz')
sig3 = get_test_data('GCA_000961135.2.hp.sig.gz')
ss1 = sourmash.load_one_signature(sig1, ksize=30, select_moltype='protein')
ss2 = sourmash.load_one_signature(sig2, ksize=30, select_moltype='dayhoff')
ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='hp')

runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output,
'--failed', failed, '-r', '1',
'--param-str',"protein,k=10,scaled=200",
'-p', "dayhoff,k=10,scaled=200",
'-p', "hp,k=10,scaled=200")

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty

idx = sourmash.load_file_as_index(output)
sigs = list(idx.signatures())

assert len(sigs) == 3
for sig in sigs:
assert sig.name == ss1.name
if sig.minhash.moltype == 'protein':
assert sig.md5sum() == ss1.md5sum()
elif sig.minhash.moltype == 'dayhoff':
assert sig.md5sum() == ss2.md5sum()
elif sig.minhash.moltype == 'hp':
assert sig.md5sum() == ss3.md5sum()
assert os.path.exists(failed)
with open(failed, 'r') as failF:
fail_lines = failF.readlines()
print(fail_lines)
assert len(fail_lines) == 2
assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n"
acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',')
assert acc == "GCA_000175535.1"
assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14"
assert moltype == "protein"
assert download_filename == "GCA_000175535.1_protein.faa.gz"
assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"

41 changes: 41 additions & 0 deletions tests/test_urlsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,3 +351,44 @@ def test_zip_file_permissions(runtmp):
# check permissions are 644 (rw-r--r--)
assert external_attr == 0o644


def test_gbsketch_protein_dayhoff_hp(runtmp):
acc_csv = get_test_data('acc-url.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('failed.csv')

sig1 = get_test_data('GCA_000961135.2.protein.sig.gz')
sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz')
sig3 = get_test_data('GCA_000961135.2.hp.sig.gz')
ss1 = sourmash.load_one_signature(sig1, ksize=30, select_moltype='protein')
ss2 = sourmash.load_one_signature(sig2, ksize=30, select_moltype='dayhoff')
ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='hp')

runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output,
'--failed', failed, '-r', '1',
'--param-str',"protein,k=10,scaled=200",
'-p', "dayhoff,k=10,scaled=200",
'-p', "hp,k=10,scaled=200")

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty

idx = sourmash.load_file_as_index(output)
sigs = list(idx.signatures())

assert len(sigs) == 3
for sig in sigs:
assert sig.name == ss1.name
if sig.minhash.moltype == 'protein':
assert sig.md5sum() == ss1.md5sum()
elif sig.minhash.moltype == 'dayhoff':
assert sig.md5sum() == ss2.md5sum()
elif sig.minhash.moltype == 'hp':
assert sig.md5sum() == ss3.md5sum()
assert os.path.exists(failed)
with open(failed, 'r') as failF:
fail_lines = failF.readlines()
print(fail_lines)
assert len(fail_lines) == 1
assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n"