From f5e4153a49af4c9aff5e9a030388e0cd1705c892 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Wed, 20 Nov 2024 16:36:07 -0800
Subject: [PATCH] working skipmers, i think!

---
 Cargo.lock                                    |   3 +-
 Cargo.toml                                    |   3 +-
 .../sourmash_plugin_branchwater/__init__.py   |   2 +-
 src/python/tests/test_sketch.py               | 130 ++++++++++++++++++
 src/utils/buildutils.rs                       |  25 +++-
 5 files changed, 156 insertions(+), 7 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e52949a0..1e643c6f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1773,8 +1773,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7"
 [[package]]
 name = "sourmash"
 version = "0.17.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54e30f752d984b1d8456024973f8d89772b4ba248f592b77b57d59ad27a232a0"
+source = "git+https://github.com/sourmash-bio/sourmash/?branch=try-skipmers#d4a620094ba90a86dcd18197732f2bb18f7266f5"
 dependencies = [
  "az",
  "byteorder",
diff --git a/Cargo.toml b/Cargo.toml
index a1c9b1f8..81a923e9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,7 +12,8 @@ crate-type = ["cdylib"]
 pyo3 = { version = "0.23.1", features = ["extension-module", "anyhow"] }
 rayon = "1.10.0"
 serde = { version = "1.0.215", features = ["derive"] }
-sourmash = { version = "0.17.2", features = ["branchwater"] }
+#sourmash = { version = "0.17.2", features = ["branchwater"] }
+sourmash = {git = "https://github.com/sourmash-bio/sourmash/", branch = "try-skipmers", features = ["branchwater"]}
 serde_json = "1.0.133"
 niffler = "2.4.0"
 log = "0.4.22"
diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py
index 3de354aa..5eb8293b 100755
--- a/src/python/sourmash_plugin_branchwater/__init__.py
+++ b/src/python/sourmash_plugin_branchwater/__init__.py
@@ -618,7 +618,7 @@ def main(self, args):
             args.param_string = ["k=31,scaled=1000,dna"]
 
         # Check and append 'dna' if no moltype is found in a param string
-        moltypes = ["dna", "protein", "dayhoff", "hp"]
+        moltypes = ["dna", "protein", "dayhoff", "hp", "skipmer"]
         updated_param_strings = []
 
         for param in args.param_string:
diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py
index 86800008..13714565 100644
--- a/src/python/tests/test_sketch.py
+++ b/src/python/tests/test_sketch.py
@@ -1,5 +1,6 @@
 import os
 import pytest
+import csv
 import pandas
 import sourmash
 from sourmash import index
@@ -1435,3 +1436,132 @@ def test_singlesketch_zip_output(runtmp):
     runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
     sig2 = sourmash.load_one_signature(output2)
     assert sig.minhash.hashes == sig2.minhash.hashes
+
+
+def test_manysketch_skipmer(runtmp, capfd):
+    fa_csv = runtmp.output("db-fa.csv")
+
+    fa1 = get_test_data("short.fa")
+    fa2 = get_test_data("short2.fa")
+    fa3 = get_test_data("short3.fa")
+    protfa1 = get_test_data("short-protein.fa")
+
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1])
+
+    output = runtmp.output("db.zip")
+
+    runtmp.sourmash(
+        "scripts",
+        "manysketch",
+        fa_csv,
+        "-o",
+        output,
+        "--param-str",
+        "dna,k=21,scaled=1",
+        "--param-str",
+        "skipmer,k=31,scaled=30",
+    )
+
+    assert os.path.exists(output)
+    assert not runtmp.last_result.out  # stdout should be empty
+    captured = capfd.readouterr()
+    print(captured.err)
+
+    idx = sourmash.load_file_as_index(output)
+    sigs = list(idx.signatures())
+    print(sigs)
+
+    assert len(sigs) == 3  # 3 dna, 3 skipmer. But sourmash can only read the DNA sigs!!
+    # check moltypes, etc!
+    dna_md5sums = {
+        "short": "1474578c5c46dd09da4c2df29cf86621",
+        "short2": "4efeebd26644278e36b9553e018a851a",
+        "short3": "f85747ac4f473c4a71c1740d009f512b",
+    }
+    for sig in sigs:
+        if sig.minhash.is_dna:
+            assert sig.minhash.ksize == 21
+            assert sig.minhash.scaled == 1
+            print("DNA: ", sig.name, sig.md5sum())
+            assert sig.md5sum() == dna_md5sums[sig.name]
+
+    # read the file with python and check sigs
+    import zipfile, gzip, json
+
+    with zipfile.ZipFile(output, "r") as zf:
+        # Check the manifest exists
+        assert "SOURMASH-MANIFEST.csv" in zf.namelist()
+
+        expected_signatures = [
+            {
+                "name": "short",
+                "ksize": 31,
+                "scaled": 30,
+                "moltype": "skipmer",
+                "md5sum": "c7b5b8d98c6fde00e2a25769d5eb470a",
+            },
+            {
+                "name": "short3",
+                "ksize": 31,
+                "scaled": 30,
+                "moltype": "skipmer",
+                "md5sum": "ed5bbe1b2a9a5cd83a6c8583e8122518",
+            },
+            {
+                "name": "short2",
+                "ksize": 31,
+                "scaled": 30,
+                "moltype": "skipmer",
+                "md5sum": "13c456b9a346284d0acdf49b3e529da6",
+            },
+        ]
+        expected_signatures_dict = {exp["md5sum"]: exp for exp in expected_signatures}
+
+        # Read and parse the manifest
+        with zf.open("SOURMASH-MANIFEST.csv") as manifest_file:
+            manifest_data = manifest_file.read().decode("utf-8").splitlines()
+            manifest_data = [line for line in manifest_data if not line.startswith("#")]
+            manifest_reader = csv.DictReader(manifest_data)
+
+            for row in manifest_reader:
+                if row["moltype"] == "skipmer":
+                    print("Manifest Row:", row)
+
+                    # Validate row fields
+                    md5sum = row["md5"]
+                    assert (
+                        md5sum in expected_signatures_dict
+                    ), f"Unexpected md5sum: {md5sum}"
+                    expected = expected_signatures_dict[md5sum]
+                    assert (
+                        row["name"] == expected["name"]
+                    ), f"Name mismatch: {row['name']}"
+                    assert (
+                        int(row["ksize"]) == expected["ksize"]
+                    ), f"Ksize mismatch: {row['ksize']}"
+                    assert (
+                        row["moltype"] == expected["moltype"]
+                    ), f"Moltype mismatch: {row['moltype']}"
+
+                    sig_path = row["internal_location"]
+                    assert sig_path.startswith("signatures/")
+
+                    # Extract and read the signature file
+                    with zf.open(sig_path) as sig_gz:
+                        with gzip.open(sig_gz, "rt") as sig_file:
+                            sig_contents = json.load(sig_file)
+                            print("Signature Contents:", sig_contents)
+
+                            # Validate signature contents
+                            sig_data = sig_contents[0]
+                            print(sig_data)
+                            siginfo = sig_data["signatures"][0]
+                            assert (
+                                siginfo["md5sum"] == md5sum
+                            ), f"MD5 mismatch: {siginfo['md5sum']}"
+                            assert (
+                                siginfo["ksize"] == expected["ksize"]
+                            ), f"Ksize mismatch: {siginfo['ksize']}"
+                            assert (
+                                siginfo["molecule"] == expected["moltype"]
+                            ), f"Moltype mismatch: {siginfo['molecule']}"
diff --git a/src/utils/buildutils.rs b/src/utils/buildutils.rs
index 8d75391c..95e1b77e 100644
--- a/src/utils/buildutils.rs
+++ b/src/utils/buildutils.rs
@@ -49,7 +49,7 @@ impl MultiSelection {
     pub fn from_input_moltype(input_moltype: &str) -> Result<Self, SourmashError> {
         // currently we don't allow translation. Will need to change this when we do.
         // is there a better way to do this?
-        let mut moltypes = vec!["DNA"];
+        let mut moltypes = vec!["DNA", "skipmer"]; // change so default is just dna?
         if input_moltype == "protein" {
             moltypes = vec!["protein", "dayhoff", "hp"];
         }
@@ -181,6 +181,15 @@ impl BuildRecord {
         }
     }
 
+    pub fn default_skipmer() -> Self {
+        Self {
+            moltype: "skipmer".to_string(),
+            ksize: 21,
+            scaled: 1000,
+            ..Self::default_dna()
+        }
+    }
+
     pub fn moltype(&self) -> HashFunctions {
         self.moltype.as_str().try_into().unwrap()
     }
@@ -420,6 +429,13 @@ impl BuildCollection {
         Ok(mf.records.len())
     }
 
+    pub fn skipmer_size(&self) -> Result<usize, SourmashError> {
+        let multiselection = MultiSelection::from_moltypes(vec!["skipmer"])?;
+        let mut mf = self.manifest.clone(); // temporary mutable copy
+        mf.select(&multiselection)?;
+        Ok(mf.records.len())
+    }
+
     pub fn protein_size(&self) -> Result<usize, SourmashError> {
         let multiselection = MultiSelection::from_moltypes(vec!["protein"])?;
         let mut mf = self.manifest.clone(); // temporary mutable copy
@@ -466,7 +482,7 @@ impl BuildCollection {
 
     pub fn parse_moltype(item: &str, current: &mut Option<String>) -> Result<String, String> {
         let new_moltype = match item {
-            "protein" | "dna" | "dayhoff" | "hp" => item.to_string(),
+            "protein" | "dna" | "dayhoff" | "hp" | "skipmer" => item.to_string(),
             _ => return Err(format!("unknown moltype '{}'", item)),
         };
 
@@ -531,7 +547,7 @@ impl BuildCollection {
                 "abund" | "noabund" => {
                     Self::parse_abundance(item, &mut track_abundance)?;
                 }
-                "protein" | "dna" | "DNA" | "dayhoff" | "hp" => {
+                "protein" | "dna" | "DNA" | "dayhoff" | "hp" | "skipmer" => {
                     Self::parse_moltype(item, &mut moltype)?;
                 }
                 _ if item.starts_with("num=") => {
@@ -566,6 +582,7 @@ impl BuildCollection {
             "protein" => BuildRecord::default_protein(),
             "dayhoff" => BuildRecord::default_dayhoff(),
             "hp" => BuildRecord::default_hp(),
+            "skipmer" => BuildRecord::default_skipmer(),
             _ => {
                 return Err(format!(
                     "Error parsing params string '{}': Unsupported moltype '{}'",
@@ -660,6 +677,7 @@ impl BuildCollection {
             .dna(record.moltype == "DNA")
             .dayhoff(record.moltype == "dayhoff")
             .hp(record.moltype == "hp")
+            .skipmer(record.moltype == "skipmer")
             .num_hashes(record.num)
             .track_abundance(record.with_abundance)
             .build();
@@ -748,6 +766,7 @@ impl BuildCollection {
                 }
             } else if (input_moltype == "DNA" || input_moltype == "dna")
                 && rec.moltype() == HashFunctions::Murmur64Dna
+                || rec.moltype() == HashFunctions::Murmur64Skipmer
             {
                 sig.add_sequence(&record.seq(), true)
                     .context("Failed to add sequence")?;