From f5e4153a49af4c9aff5e9a030388e0cd1705c892 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 20 Nov 2024 16:36:07 -0800 Subject: [PATCH] working skipmers, i think! --- Cargo.lock | 3 +- Cargo.toml | 3 +- .../sourmash_plugin_branchwater/__init__.py | 2 +- src/python/tests/test_sketch.py | 130 ++++++++++++++++++ src/utils/buildutils.rs | 25 +++- 5 files changed, 156 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e52949a0..1e643c6f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1773,8 +1773,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.17.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54e30f752d984b1d8456024973f8d89772b4ba248f592b77b57d59ad27a232a0" +source = "git+https://github.com/sourmash-bio/sourmash/?branch=try-skipmers#d4a620094ba90a86dcd18197732f2bb18f7266f5" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index a1c9b1f8..81a923e9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.23.1", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.215", features = ["derive"] } -sourmash = { version = "0.17.2", features = ["branchwater"] } +#sourmash = { version = "0.17.2", features = ["branchwater"] } +sourmash = {git = "https://github.com/sourmash-bio/sourmash/", branch = "try-skipmers", features = ["branchwater"]} serde_json = "1.0.133" niffler = "2.4.0" log = "0.4.22" diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 3de354aa..5eb8293b 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -618,7 +618,7 @@ def main(self, args): args.param_string = ["k=31,scaled=1000,dna"] # Check and append 'dna' if no moltype is found in a param string - moltypes = ["dna", "protein", "dayhoff", "hp"] + moltypes = ["dna", "protein", "dayhoff", "hp", "skipmer"] updated_param_strings = [] for param in args.param_string: diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index 86800008..13714565 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -1,5 +1,6 @@ import os import pytest +import csv import pandas import sourmash from sourmash import index @@ -1435,3 +1436,132 @@ def test_singlesketch_zip_output(runtmp): runtmp.sourmash("sketch", "dna", fa1, "-o", output2) sig2 = sourmash.load_one_signature(output2) assert sig.minhash.hashes == sig2.minhash.hashes + + +def test_manysketch_skipmer(runtmp, capfd): + fa_csv = runtmp.output("db-fa.csv") + + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") + protfa1 = get_test_data("short-protein.fa") + + make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) + + output = runtmp.output("db.zip") + + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=21,scaled=1", + "--param-str", + "skipmer,k=31,scaled=30", + ) + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + captured = capfd.readouterr() + print(captured.err) + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + print(sigs) + + assert len(sigs) == 3 # 3 dna, 3 skipmer. But sourmash can only read the DNA sigs!! + # check moltypes, etc! + dna_md5sums = { + "short": "1474578c5c46dd09da4c2df29cf86621", + "short2": "4efeebd26644278e36b9553e018a851a", + "short3": "f85747ac4f473c4a71c1740d009f512b", + } + for sig in sigs: + if sig.minhash.is_dna: + assert sig.minhash.ksize == 21 + assert sig.minhash.scaled == 1 + print("DNA: ", sig.name, sig.md5sum()) + assert sig.md5sum() == dna_md5sums[sig.name] + + # read the file with python and check sigs + import zipfile, gzip, json + + with zipfile.ZipFile(output, "r") as zf: + # Check the manifest exists + assert "SOURMASH-MANIFEST.csv" in zf.namelist() + + expected_signatures = [ + { + "name": "short", + "ksize": 31, + "scaled": 30, + "moltype": "skipmer", + "md5sum": "c7b5b8d98c6fde00e2a25769d5eb470a", + }, + { + "name": "short3", + "ksize": 31, + "scaled": 30, + "moltype": "skipmer", + "md5sum": "ed5bbe1b2a9a5cd83a6c8583e8122518", + }, + { + "name": "short2", + "ksize": 31, + "scaled": 30, + "moltype": "skipmer", + "md5sum": "13c456b9a346284d0acdf49b3e529da6", + }, + ] + expected_signatures_dict = {exp["md5sum"]: exp for exp in expected_signatures} + + # Read and parse the manifest + with zf.open("SOURMASH-MANIFEST.csv") as manifest_file: + manifest_data = manifest_file.read().decode("utf-8").splitlines() + manifest_data = [line for line in manifest_data if not line.startswith("#")] + manifest_reader = csv.DictReader(manifest_data) + + for row in manifest_reader: + if row["moltype"] == "skipmer": + print("Manifest Row:", row) + + # Validate row fields + md5sum = row["md5"] + assert ( + md5sum in expected_signatures_dict + ), f"Unexpected md5sum: {md5sum}" + expected = expected_signatures_dict[md5sum] + assert ( + row["name"] == expected["name"] + ), f"Name mismatch: {row['name']}" + assert ( + int(row["ksize"]) == expected["ksize"] + ), f"Ksize mismatch: {row['ksize']}" + assert ( + row["moltype"] == expected["moltype"] + ), f"Moltype mismatch: {row['moltype']}" + + sig_path = row["internal_location"] + assert sig_path.startswith("signatures/") + + # Extract and read the signature file + with zf.open(sig_path) as sig_gz: + with gzip.open(sig_gz, "rt") as sig_file: + sig_contents = json.load(sig_file) + print("Signature Contents:", sig_contents) + + # Validate signature contents + sig_data = sig_contents[0] + print(sig_data) + siginfo = sig_data["signatures"][0] + assert ( + siginfo["md5sum"] == md5sum + ), f"MD5 mismatch: {siginfo['md5sum']}" + assert ( + siginfo["ksize"] == expected["ksize"] + ), f"Ksize mismatch: {siginfo['ksize']}" + assert ( + siginfo["molecule"] == expected["moltype"] + ), f"Moltype mismatch: {siginfo['molecule']}" diff --git a/src/utils/buildutils.rs b/src/utils/buildutils.rs index 8d75391c..95e1b77e 100644 --- a/src/utils/buildutils.rs +++ b/src/utils/buildutils.rs @@ -49,7 +49,7 @@ impl MultiSelection { pub fn from_input_moltype(input_moltype: &str) -> Result { // currently we don't allow translation. Will need to change this when we do. // is there a better way to do this? - let mut moltypes = vec!["DNA"]; + let mut moltypes = vec!["DNA", "skipmer"]; // change so default is just dna? if input_moltype == "protein" { moltypes = vec!["protein", "dayhoff", "hp"]; } @@ -181,6 +181,15 @@ impl BuildRecord { } } + pub fn default_skipmer() -> Self { + Self { + moltype: "skipmer".to_string(), + ksize: 21, + scaled: 1000, + ..Self::default_dna() + } + } + pub fn moltype(&self) -> HashFunctions { self.moltype.as_str().try_into().unwrap() } @@ -420,6 +429,13 @@ impl BuildCollection { Ok(mf.records.len()) } + pub fn skipmer_size(&self) -> Result { + let multiselection = MultiSelection::from_moltypes(vec!["skipmer"])?; + let mut mf = self.manifest.clone(); // temporary mutable copy + mf.select(&multiselection)?; + Ok(mf.records.len()) + } + pub fn protein_size(&self) -> Result { let multiselection = MultiSelection::from_moltypes(vec!["protein"])?; let mut mf = self.manifest.clone(); // temporary mutable copy @@ -466,7 +482,7 @@ impl BuildCollection { pub fn parse_moltype(item: &str, current: &mut Option) -> Result { let new_moltype = match item { - "protein" | "dna" | "dayhoff" | "hp" => item.to_string(), + "protein" | "dna" | "dayhoff" | "hp" | "skipmer" => item.to_string(), _ => return Err(format!("unknown moltype '{}'", item)), }; @@ -531,7 +547,7 @@ impl BuildCollection { "abund" | "noabund" => { Self::parse_abundance(item, &mut track_abundance)?; } - "protein" | "dna" | "DNA" | "dayhoff" | "hp" => { + "protein" | "dna" | "DNA" | "dayhoff" | "hp" | "skipmer" => { Self::parse_moltype(item, &mut moltype)?; } _ if item.starts_with("num=") => { @@ -566,6 +582,7 @@ impl BuildCollection { "protein" => BuildRecord::default_protein(), "dayhoff" => BuildRecord::default_dayhoff(), "hp" => BuildRecord::default_hp(), + "skipmer" => BuildRecord::default_skipmer(), _ => { return Err(format!( "Error parsing params string '{}': Unsupported moltype '{}'", @@ -660,6 +677,7 @@ impl BuildCollection { .dna(record.moltype == "DNA") .dayhoff(record.moltype == "dayhoff") .hp(record.moltype == "hp") + .skipmer(record.moltype == "skipmer") .num_hashes(record.num) .track_abundance(record.with_abundance) .build(); @@ -748,6 +766,7 @@ impl BuildCollection { } } else if (input_moltype == "DNA" || input_moltype == "dna") && rec.moltype() == HashFunctions::Murmur64Dna + || rec.moltype() == HashFunctions::Murmur64Skipmer { sig.add_sequence(&record.seq(), true) .context("Failed to add sequence")?;