MRG: add capacity for skipmer sketching and search (#531)

* working skipmers, i think! * test singlesketch skipmers * add skipmer as moltype option for other commands * black formatting * add skipmer test sigs; add fastgather test * upd * upd test files * upd * fix * fix2 * try not pinning * MRG: switch to `importlib.metadata` from `pkg_resources` (#546) * switch from pkg_resources to importlib.metadata * black * need parens for proper selection * add anydna_size * update to sourmash prospective v0.18.0 * remove tests for bad zip files 😅 * upd to use py-skipmers * switch to sourmash latest; add skipmer indexing test * black formatting * upd tests * test for intersect_manifest bug * fix lint * fmt python * cargo fmt * remove redundant test * bump to sourmash r0.18.0 --------- Co-authored-by: C. Titus Brown <[email protected]>
sourmash-bio · Dec 21, 2024 · 6a3c49a · 6a3c49a
1 parent 992bf48
commit 6a3c49a
Show file tree

Hide file tree

Showing 9 changed files with 349 additions and 18 deletions.
diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py
@@ -78,8 +78,8 @@ def __init__(self, p):
             "-m",
             "--moltype",
             default="DNA",
-            choices=["DNA", "protein", "dayhoff", "hp"],
-            help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
+            choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
+            help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
         )
         p.add_argument(
             "-c",
@@ -187,8 +187,8 @@ def __init__(self, p):
             "-m",
             "--moltype",
             default="DNA",
-            choices=["DNA", "protein", "dayhoff", "hp"],
-            help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
+            choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
+            help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
         )
         p.add_argument(
             "-c",
@@ -264,8 +264,8 @@ def __init__(self, p):
             "-m",
             "--moltype",
             default="DNA",
-            choices=["DNA", "protein", "dayhoff", "hp"],
-            help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
+            choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
+            help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
         )
         p.add_argument(
             "-c",
@@ -348,8 +348,8 @@ def __init__(self, p):
             "-m",
             "--moltype",
             default="DNA",
-            choices=["DNA", "protein", "dayhoff", "hp"],
-            help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
+            choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
+            help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
         )
         p.add_argument(
             "-c",
@@ -450,8 +450,8 @@ def __init__(self, p):
             "-m",
             "--moltype",
             default="DNA",
-            choices=["DNA", "protein", "dayhoff", "hp"],
-            help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
+            choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
+            help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
         )
         p.add_argument(
             "-c",
@@ -544,8 +544,8 @@ def __init__(self, p):
             "-m",
             "--moltype",
             default="DNA",
-            choices=["DNA", "protein", "dayhoff", "hp"],
-            help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
+            choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
+            help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
         )
         p.add_argument(
             "-c",
@@ -640,7 +640,7 @@ def main(self, args):
             args.param_string = ["k=31,scaled=1000,dna"]
 
         # Check and append 'dna' if no moltype is found in a param string
-        moltypes = ["dna", "protein", "dayhoff", "hp"]
+        moltypes = ["dna", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"]
         updated_param_strings = []
 
         for param in args.param_string:

diff --git a/src/python/tests/test-data/SRR606249.skipm2n3.zip b/src/python/tests/test-data/SRR606249.skipm2n3.zip
diff --git a/src/python/tests/test-data/skipm1n3.zip b/src/python/tests/test-data/skipm1n3.zip
diff --git a/src/python/tests/test-data/skipm2n3.zip b/src/python/tests/test-data/skipm2n3.zip
diff --git a/src/python/tests/test_fastgather.py b/src/python/tests/test_fastgather.py
@@ -1328,3 +1328,66 @@ def test_equal_matches(runtmp):
     df = pandas.read_csv(runtmp.output("out.csv"))
     assert len(df) == 2
     assert set(df["intersect_bp"]) == {1000}
+
+
+def test_simple_skipm2n3(
+    runtmp, capfd, indexed_query, indexed_against, toggle_internal_storage
+):
+    # test basic execution!
+    query = get_test_data("SRR606249.skipm2n3.zip")
+    against = get_test_data("skipm2n3.zip")  # contains 2,47,63
+
+    if indexed_query:
+        query = index_siglist(
+            runtmp, query, runtmp.output("query"), scaled=100000, moltype="skipm2n3"
+        )
+
+    if indexed_against:
+        against = index_siglist(
+            runtmp,
+            against,
+            runtmp.output("db"),
+            moltype="skipm2n3",
+            toggle_internal_storage=toggle_internal_storage,
+        )
+
+    g_output = runtmp.output("gather.csv")
+    p_output = runtmp.output("prefetch.csv")
+
+    runtmp.sourmash(
+        "scripts",
+        "fastgather",
+        "--moltype",
+        "skipm2n3",
+        query,
+        against,
+        "-o",
+        g_output,
+        "-s",
+        "100000",
+    )
+    assert os.path.exists(g_output)
+
+    captured = capfd.readouterr()
+    print(captured.err)
+
+    df = pandas.read_csv(g_output)
+    assert len(df) == 3
+    keys = set(df.keys())
+    assert {
+        "query_filename",
+        "query_name",
+        "query_md5",
+        "match_name",
+        "match_md5",
+        "gather_result_rank",
+        "intersect_bp",
+    }.issubset(keys)
+
+    # CTB note: we do not need to worry about this warning for query from a
+    # RocksDB, since there is only one.
+    if indexed_against:
+        print("indexed against:", indexed_against)
+        assert (
+            "WARNING: loading all sketches from a RocksDB into memory!" in captured.err
+        )
diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py
@@ -487,6 +487,48 @@ def test_index_subdir(runtmp, toggle_internal_storage):
     runtmp.sourmash("scripts", "check", output)
 
 
+def test_index_skipm2n3(runtmp, toggle_internal_storage):
+    # test basic index!
+    sigzip = get_test_data("skipm2n3.zip")
+    output = runtmp.output("db.rocksdb")
+
+    runtmp.sourmash(
+        "scripts",
+        "index",
+        sigzip,
+        "-o",
+        output,
+        toggle_internal_storage,
+        "-m",
+        "skipm2n3",
+    )
+    assert os.path.exists(output)
+    print(runtmp.last_result.err)
+
+    assert "index is done" in runtmp.last_result.err
+
+
+def test_index_skipm1n3(runtmp, toggle_internal_storage):
+    # test basic index!
+    sigzip = get_test_data("skipm1n3.zip")
+    output = runtmp.output("db.rocksdb")
+
+    runtmp.sourmash(
+        "scripts",
+        "index",
+        sigzip,
+        "-o",
+        output,
+        toggle_internal_storage,
+        "-m",
+        "skipm1n3",
+    )
+    assert os.path.exists(output)
+    print(runtmp.last_result.err)
+
+    assert "index is done" in runtmp.last_result.err
+
+
 def test_index_misnamed_zipfile(runtmp, capfd):
     # test with a misnamed input zipfile (a .sig.gz file renamed as zip file)
     # (This is a generic test that checks to make sure misnamed zip files

diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py
@@ -1,5 +1,6 @@
 import os
 import pytest
+import csv
 import pandas
 import sourmash
 from sourmash import index
@@ -1435,3 +1436,181 @@ def test_singlesketch_zip_output(runtmp):
     runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
     sig2 = sourmash.load_one_signature(output2)
     assert sig.minhash.hashes == sig2.minhash.hashes
+
+
+def test_manysketch_skipm2n3(runtmp, capfd):
+    fa_csv = runtmp.output("db-fa.csv")
+
+    fa1 = get_test_data("short.fa")
+    fa2 = get_test_data("short2.fa")
+    fa3 = get_test_data("short3.fa")
+    protfa1 = get_test_data("short-protein.fa")
+
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1])
+
+    output = runtmp.output("db.zip")
+
+    runtmp.sourmash(
+        "scripts",
+        "manysketch",
+        fa_csv,
+        "-o",
+        output,
+        "--param-str",
+        "dna,k=21,scaled=1",
+        "--param-str",
+        "skipm2n3,k=31,scaled=30",
+    )
+
+    assert os.path.exists(output)
+    assert not runtmp.last_result.out  # stdout should be empty
+    captured = capfd.readouterr()
+    print(captured.err)
+
+    idx = sourmash.load_file_as_index(output)
+    sigs = list(idx.signatures())
+    print(sigs)
+
+    assert len(sigs) == 3  # 3 dna, 3 skipmer. But sourmash can only read the DNA sigs!!
+    # check moltypes, etc!
+    dna_md5sums = {
+        "short": "1474578c5c46dd09da4c2df29cf86621",
+        "short2": "4efeebd26644278e36b9553e018a851a",
+        "short3": "f85747ac4f473c4a71c1740d009f512b",
+    }
+    for sig in sigs:
+        if sig.minhash.is_dna:
+            assert sig.minhash.ksize == 21
+            assert sig.minhash.scaled == 1
+            print("DNA: ", sig.name, sig.md5sum())
+            assert sig.md5sum() == dna_md5sums[sig.name]
+
+    # read the file with python and check sigs
+    import zipfile, gzip, json
+
+    with zipfile.ZipFile(output, "r") as zf:
+        # Check the manifest exists
+        assert "SOURMASH-MANIFEST.csv" in zf.namelist()
+
+        expected_signatures = [
+            {
+                "name": "short",
+                "ksize": 31,
+                "scaled": 30,
+                "moltype": "skipm2n3",
+                "md5sum": "0486fcae73545363da9cd5bfcf18d322",
+            },
+            {
+                "name": "short3",
+                "ksize": 31,
+                "scaled": 30,
+                "moltype": "skipm2n3",
+                "md5sum": "890557b39ae66d3177035296818de7c6",
+            },
+            {
+                "name": "short2",
+                "ksize": 31,
+                "scaled": 30,
+                "moltype": "skipm2n3",
+                "md5sum": "ec6305f5d82e51659f3914d47fcc32ee",
+            },
+        ]
+        expected_signatures_dict = {exp["md5sum"]: exp for exp in expected_signatures}
+
+        # Read and parse the manifest
+        with zf.open("SOURMASH-MANIFEST.csv") as manifest_file:
+            manifest_data = manifest_file.read().decode("utf-8").splitlines()
+            manifest_data = [line for line in manifest_data if not line.startswith("#")]
+            manifest_reader = csv.DictReader(manifest_data)
+
+            for row in manifest_reader:
+                if row["moltype"] == "skipm2n3":
+                    print("Manifest Row:", row)
+
+                    # Validate row fields
+                    md5sum = row["md5"]
+                    assert (
+                        md5sum in expected_signatures_dict
+                    ), f"Unexpected md5sum: {md5sum}"
+                    expected = expected_signatures_dict[md5sum]
+                    assert (
+                        row["name"] == expected["name"]
+                    ), f"Name mismatch: {row['name']}"
+                    assert (
+                        int(row["ksize"]) == expected["ksize"]
+                    ), f"Ksize mismatch: {row['ksize']}"
+                    assert (
+                        row["moltype"] == expected["moltype"]
+                    ), f"Moltype mismatch: {row['moltype']}"
+
+                    sig_path = row["internal_location"]
+                    assert sig_path.startswith("signatures/")
+
+                    # Extract and read the signature file
+                    with zf.open(sig_path) as sig_gz:
+                        with gzip.open(sig_gz, "rt") as sig_file:
+                            sig_contents = json.load(sig_file)
+                            print("Signature Contents:", sig_contents)
+
+                            # Validate signature contents
+                            sig_data = sig_contents[0]
+                            print(sig_data)
+                            siginfo = sig_data["signatures"][0]
+                            assert (
+                                siginfo["md5sum"] == md5sum
+                            ), f"MD5 mismatch: {siginfo['md5sum']}"
+                            assert (
+                                siginfo["ksize"] == expected["ksize"]
+                            ), f"Ksize mismatch: {siginfo['ksize']}"
+                            assert (
+                                siginfo["molecule"] == expected["moltype"]
+                            ), f"Moltype mismatch: {siginfo['molecule']}"
+
+
+def test_singlesketch_skipm2n3(runtmp):
+    """Test singlesketch with skipm2n3."""
+    fa1 = get_test_data("short.fa")
+    output = runtmp.output("short.sig")
+
+    # Run the singlesketch command
+    runtmp.sourmash(
+        "scripts", "singlesketch", fa1, "-p", "skipm2n3,k=31,scaled=100", "-o", output
+    )
+
+    # Check if the output exists and contains the expected data
+    assert os.path.exists(output)
+    # Load the output signature file
+    import json
+
+    with open(output, "r") as f:
+        data = json.load(f)
+
+    # Extract the signature part from the JSON
+    signatures = data[0]["signatures"]
+
+    # Expected signature fields
+    expected_signatures = [
+        {
+            "name": "short.fa",
+            "ksize": 31,
+            "moltype": "skipm2n3",
+            "md5sum": "387d25c8e4b4878c78872efd13621491",
+        }
+    ]
+
+    # Check if the signatures match the expected
+    assert len(signatures) == len(
+        expected_signatures
+    ), "Number of signatures does not match."
+
+    for sig, expected in zip(signatures, expected_signatures):
+        assert sig["ksize"] == expected["ksize"], f"Unexpected ksize: {sig['ksize']}"
+        assert (
+            sig["molecule"] == expected["moltype"]
+        ), f"Unexpected moltype: {sig['molecule']}"
+        assert (
+            sig["md5sum"] == expected["md5sum"]
+        ), f"Unexpected md5sum: {sig['md5sum']}"
+        assert (
+            data[0]["name"] == expected["name"]
+        ), f"Unexpected name: {data[0]['name']}"