From f7a3a12d5bb25b6f499af1f88aaaaad40bfe672d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 17 Dec 2024 06:36:44 -0800 Subject: [PATCH 1/5] just the one commit, ma'am (#547) --- src/python/tests/sourmash_tst_utils.py | 45 +++++--------------------- 1 file changed, 8 insertions(+), 37 deletions(-) diff --git a/src/python/tests/sourmash_tst_utils.py b/src/python/tests/sourmash_tst_utils.py index 86c97c57..0bb30b14 100644 --- a/src/python/tests/sourmash_tst_utils.py +++ b/src/python/tests/sourmash_tst_utils.py @@ -7,8 +7,7 @@ import collections import pprint -import pkg_resources -from pkg_resources import Requirement, resource_filename, ResolutionError +import importlib.metadata import traceback from io import open # pylint: disable=redefined-builtin from io import StringIO @@ -61,46 +60,18 @@ def index_siglist( return db -def scriptpath(scriptname="sourmash"): - """Return the path to the scripts, in both dev and install situations.""" - # note - it doesn't matter what the scriptname is here, as long as - # it's some script present in this version of sourmash. - - path = os.path.join(os.path.dirname(__file__), "../") - if os.path.exists(os.path.join(path, scriptname)): - return path - - path = os.path.join(os.path.dirname(__file__), "../../EGG-INFO/") - if os.path.exists(os.path.join(path, scriptname)): - return path - - for path in os.environ["PATH"].split(":"): - if os.path.exists(os.path.join(path, scriptname)): - return path - - def _runscript(scriptname): """Find & run a script with exec (i.e. 
not via os.system or subprocess).""" namespace = {"__name__": "__main__"} namespace["sys"] = globals()["sys"] - try: - pkg_resources.load_entry_point("sourmash", "console_scripts", scriptname)() - return 0 - except pkg_resources.ResolutionError: - pass - - path = scriptpath() - - scriptfile = os.path.join(path, scriptname) - if os.path.isfile(scriptfile): - if os.path.isfile(scriptfile): - exec( # pylint: disable=exec-used - compile(open(scriptfile).read(), scriptfile, "exec"), namespace - ) - return 0 - - return -1 + entry_points = importlib.metadata.entry_points( + group="console_scripts", name="sourmash" + ) + assert len(entry_points) == 1 + smash_cli = tuple(entry_points)[0].load() + smash_cli() + return 0 ScriptResults = collections.namedtuple("ScriptResults", ["status", "out", "err"]) From 18f4e0733850e7b88438d1571dc40bbbfb448ea1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 18 Dec 2024 10:31:59 -0800 Subject: [PATCH 2/5] update to sourmash prospective v0.18.0 --- Cargo.lock | 82 ++++++++++++++++++++++++++---------------------------- Cargo.toml | 3 +- 2 files changed, 42 insertions(+), 43 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b1e7a9aa..d74fc93a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -206,7 +206,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.11.0", "lazy_static", "lazycell", "proc-macro2", @@ -808,10 +808,11 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.72" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -972,13 +973,12 @@ checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664" [[package]] name = "nalgebra" -version = "0.32.6" +version = "0.33.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5c17de023a86f59ed79891b2e5d5a94c705dbe904a5b5c9c952ea6221b03e4" +checksum = "26aecdf64b707efd1310e3544d709c5c0ac61c13756046aaaba41be5c4f66a3b" dependencies = [ "approx", "matrixmultiply", - "nalgebra-macros", "num-complex", "num-rational", "num-traits", @@ -988,17 +988,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "nalgebra-macros" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.87", -] - [[package]] name = "ndarray" version = "0.15.6" @@ -1029,9 +1018,9 @@ dependencies = [ [[package]] name = "needletail" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29a3c5015d6985f33318d154fa0c41315eb2e7df29432c844c74a83434bfe21" +checksum = "de3de09e373770238e3d30eb1a9f09f4754134d0ef354d0570bc1203d2517257" dependencies = [ "buffer-redux", "bytecount", @@ -1075,6 +1064,16 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + [[package]] name = "num-complex" version = "0.4.6" @@ -1110,6 +1109,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" dependencies = [ + "num-bigint", "num-integer", "num-traits", ] @@ -1585,9 +1585,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.6" +version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8f4b84ba6e838ceb47b41de5194a60244fac43d9fe03b71dbe8c5a201081d6d1" +checksum = "41589aba99537475bf697f2118357cad1c31590c5a1b9f6d9fc4ad6d07503661" dependencies = [ "bytemuck", "byteorder", @@ -1735,9 +1735,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simba" -version = "0.8.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061507c94fc6ab4ba1c9a0305018408e312e17c041eb63bef8aa726fa33aceae" +checksum = "b3a386a501cd104797982c15ae17aafe8b9261315b5d07e3ec803f2ea26be0fa" dependencies = [ "approx", "num-complex", @@ -1772,9 +1772,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.17.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54e30f752d984b1d8456024973f8d89772b4ba248f592b77b57d59ad27a232a0" +version = "0.18.0" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=latest#f4f5187e7dc9b9c177e099bbf7f3f42556867328" dependencies = [ "az", "byteorder", @@ -1794,7 +1793,7 @@ dependencies = [ "md5", "memmap2", "murmurhash3", - "needletail 0.6.0", + "needletail 0.6.1", "niffler", "nohash-hasher", "num-iter", @@ -1856,9 +1855,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "statrs" -version = "0.17.1" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f697a07e4606a0a25c044de247e583a330dbb1731d11bc7350b81f48ad567255" +checksum = "2a3fe7c28c6512e766b0874335db33c94ad7b8f9054228ae1c2abd47ce7d335e" dependencies = [ "approx", "nalgebra", @@ -2117,9 +2116,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +checksum 
= "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" dependencies = [ "cfg-if", "once_cell", @@ -2128,13 +2127,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", "syn 2.0.87", @@ -2143,9 +2141,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2153,9 +2151,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", @@ -2166,15 +2164,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" [[package]] name = "web-sys" -version = "0.3.72" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" dependencies = [ "js-sys", 
"wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 4a845512..24234de3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.23.2", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.216", features = ["derive"] } -sourmash = { version = "0.17.2", features = ["branchwater"] } +#sourmash = { version = "0.17.2", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "latest", features = ["branchwater"] } serde_json = "1.0.133" niffler = "2.4.0" log = "0.4.22" From b510d8124055e1736485fdc76c4e74e70482b45c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 18 Dec 2024 11:17:56 -0800 Subject: [PATCH 3/5] remove tests for bad zip files :sweat_smile: --- src/python/tests/test_fastgather.py | 34 --------------- src/python/tests/test_fastmultigather.py | 55 ------------------------ src/python/tests/test_index.py | 21 --------- src/python/tests/test_manysearch.py | 27 ------------ src/python/tests/test_multisearch.py | 27 ------------ src/python/tests/test_pairwise.py | 23 ---------- 6 files changed, 187 deletions(-) diff --git a/src/python/tests/test_fastgather.py b/src/python/tests/test_fastgather.py index e3ab4255..2fb1ed78 100644 --- a/src/python/tests/test_fastgather.py +++ b/src/python/tests/test_fastgather.py @@ -427,40 +427,6 @@ def test_bad_against_2(runtmp, capfd): ) -def test_bad_against_3(runtmp, capfd): - # test with a bad against (a .sig.gz file renamed as zip file) - query = get_test_data("SRR606249.sig.gz") - - sig2 = get_test_data("2.fa.sig.gz") - against_zip = runtmp.output("against.zip") - # cp sig2 into against_zip - with open(against_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - g_output = runtmp.output("gather.csv") - p_output = runtmp.output("prefetch.csv") - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash( - "scripts", - "fastgather", - query, - 
against_zip, - "-o", - g_output, - "--output-prefetch", - p_output, - "-s", - "100000", - ) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - @pytest.mark.xfail(reason="should work, bug") def test_against_multisigfile(runtmp, zip_against): # test against a sigfile that contains multiple sketches diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index 9d9c17e7..d47f62ad 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -561,34 +561,6 @@ def test_sig_query(runtmp, capfd, indexed): }.issubset(keys) -def test_bad_query(runtmp, capfd, indexed): - # test with a bad query (a .sig.gz file renamed as zip file) - against_list = runtmp.output("against.txt") - - sig2 = get_test_data("2.fa.sig.gz") - sig47 = get_test_data("47.fa.sig.gz") - sig63 = get_test_data("63.fa.sig.gz") - - query_zip = runtmp.output("query.zip") - # cp sig2 into query_zip - with open(query_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - make_file_list(against_list, [sig2, sig47, sig63]) - - if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output("db")) - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash("scripts", "fastmultigather", query_zip, against_list) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - def test_missing_query(runtmp, capfd, indexed): # test missing query query_list = runtmp.output("query.txt") @@ -736,33 +708,6 @@ def test_bad_against(runtmp, capfd): ) -def test_bad_against_2(runtmp, capfd, zip_query): - # test with a bad against (a .sig.gz file renamed as zip file) - query = get_test_data("SRR606249.sig.gz") - query_list = runtmp.output("query.txt") - make_file_list(query_list, [query]) - - sig2 = get_test_data("2.fa.sig.gz") - against_zip = runtmp.output("against.zip") - # cp sig2 into query_zip - with 
open(against_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash( - "scripts", "fastmultigather", query_list, against_zip, "-s", "100000" - ) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - def test_empty_against(runtmp, capfd): # test bad 'against' file - in this case, an empty one query = get_test_data("SRR606249.sig.gz") diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 72d40bee..4d7a5f04 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -429,27 +429,6 @@ def test_index_zipfile_multiparam(runtmp, capfd, toggle_internal_storage): runtmp.sourmash("scripts", "index", zipf, "-o", output, toggle_internal_storage) -def test_index_zipfile_bad(runtmp, capfd): - # test with a bad input zipfile (a .sig.gz file renamed as zip file) - sig2 = get_test_data("2.fa.sig.gz") - - query_zip = runtmp.output("query.zip") - # cp sig2 into query_zip - with open(query_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - output = runtmp.output("out.csv") - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash("scripts", "index", query_zip, "-o", output) - - captured = capfd.readouterr() - print(captured.err) - - assert "Couldn't find End Of Central Directory Record" in captured.err - - def test_index_check(runtmp, toggle_internal_storage): # test check index siglist = runtmp.output("db-sigs.txt") diff --git a/src/python/tests/test_manysearch.py b/src/python/tests/test_manysearch.py index 6275b0cf..0e84fb00 100644 --- a/src/python/tests/test_manysearch.py +++ b/src/python/tests/test_manysearch.py @@ -556,33 +556,6 @@ def test_bad_query_2(runtmp, capfd, indexed): ) -def test_bad_query_3(runtmp, capfd): - # test with a bad query (a .sig.gz file 
renamed as zip file) - against_list = runtmp.output("against.txt") - - sig2 = get_test_data("2.fa.sig.gz") - sig47 = get_test_data("47.fa.sig.gz") - sig63 = get_test_data("63.fa.sig.gz") - - query_zip = runtmp.output("query.zip") - # cp sig2 into query_zip - with open(query_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - make_file_list(against_list, [sig2, sig47, sig63]) - - output = runtmp.output("out.csv") - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash("scripts", "multisearch", query_zip, against_list, "-o", output) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - def test_missing_against(runtmp, capfd, indexed): # test with a missing against list query_list = runtmp.output("query.txt") diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index dfc65ee2..3be64fed 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -628,33 +628,6 @@ def test_bad_query(runtmp, capfd): ) -def test_bad_query_3(runtmp, capfd): - # test with a bad query (a .sig.gz file renamed as zip file) - against_list = runtmp.output("against.txt") - - sig2 = get_test_data("2.fa.sig.gz") - sig47 = get_test_data("47.fa.sig.gz") - sig63 = get_test_data("63.fa.sig.gz") - - query_zip = runtmp.output("query.zip") - # cp sig2 into query_zip - with open(query_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - make_file_list(against_list, [sig2, sig47, sig63]) - - output = runtmp.output("out.csv") - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash("scripts", "multisearch", query_zip, against_list, "-o", output) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - def test_missing_against(runtmp, capfd, zip_db): # test with a missing against list query_list = runtmp.output("query.txt") diff --git 
a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 1a940043..c4021239 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -230,29 +230,6 @@ def test_bad_query(runtmp, capfd): ) -def test_bad_query_2(runtmp, capfd): - # test with a bad query (a .sig.gz file renamed as zip file) - sig2 = get_test_data("2.fa.sig.gz") - sig47 = get_test_data("47.fa.sig.gz") - sig63 = get_test_data("63.fa.sig.gz") - - query_zip = runtmp.output("query.zip") - # cp sig2 into query_zip - with open(query_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - output = runtmp.output("out.csv") - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash("scripts", "pairwise", query_zip, "-o", output) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - def test_missing_query(runtmp, capfd, zip_db): # test with a missing query list query_list = runtmp.output("query.txt") From a950309e521f4efcb4dfe3320fc5b252148fa6ed Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Thu, 19 Dec 2024 14:47:44 -0800 Subject: [PATCH 4/5] MRG: add `--output-all-comparisons` to manysearch, multisearch, and pairwise (#544) * add --output-all-comparisons to manysearch * add doc, allow -A * update docs, add -A to all the things * cargo fmt * clippy * check for directories, fail appropriately * run black * test -A * ran black again * fix test * fiz * fiz --- Cargo.lock | 2 +- doc/README.md | 41 ++-- src/lib.rs | 14 +- src/manysearch.rs | 178 ++++++++++-------- src/manysearch_rocksdb.rs | 10 +- src/multisearch.rs | 3 +- src/pairwise.rs | 8 +- .../sourmash_plugin_branchwater/__init__.py | 22 +++ src/python/tests/test_manysearch.py | 110 +++++++++++ src/python/tests/test_multisearch.py | 74 ++++++++ src/python/tests/test_pairwise.py | 56 ++++++ src/utils/mod.rs | 16 +- 12 files changed, 428 insertions(+), 106 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b1e7a9aa..933b5a90 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -206,7 +206,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.11.0", "lazy_static", "lazycell", "proc-macro2", diff --git a/doc/README.md b/doc/README.md index 6da3b286..744348fb 100644 --- a/doc/README.md +++ b/doc/README.md @@ -29,13 +29,11 @@ be processed differently. The plugin commands are also a bit less user friendly, because (for now) we're more focused on speed than polish and user experience. -**Note:** As of v0.9.5, the outputs of `fastgather` and `fastmultigather` almost completely match the output of `sourmash gather`; see below for details. - ## Input file formats sourmash supports a variety of different storage formats for sketches (see [sourmash docs](https://sourmash.readthedocs.io/en/latest/command-line.html#choosing-signature-output-formats)), and the branchwater plugin works with some (but not all) of them. 
Branchwater _also_ supports an additional database type, a RocksDB-based inverted index, that is not (yet) supported natively by sourmash (through v4.8.11). -**As of v0.9.8, we recommend using zip files or standalone manifest CSVs pointing to zip files whenever you need to provide multiple sketches.** +**We recommend using zip files or standalone manifest CSVs pointing to zip files whenever you need to provide multiple sketches.** | command | command input | database format | | -------- | -------- | -------- | @@ -58,8 +56,8 @@ When working with large collections of small sketches such as genomes, we sugges * in particular, _single_ sketches can be loaded on demand, supporting lower memory requirements for certain kinds of searches. For all these reasons, zip files are the most efficient and effective -basic storage type for sketches in sourmash, and as of the branchwater -plugin v0.9.0, they are fully supported! +basic storage type for sketches in sourmash, and the branchwater +plugin fully supports them! You can create zipfiles with sourmash like so: ``` @@ -152,7 +150,7 @@ at the start in order to generate a manifest. To avoid memory issues, the signatures are not kept in memory, but instead re-loaded as described below for each command (see: Notes on concurrency and efficiency). This makes using pathlists less efficient than `zip` -files (as of v0.9.0) or manifests (as of v0.9.8). +files. ## Running the commands @@ -304,7 +302,7 @@ version of `sourmash gather`. 
sourmash scripts fastgather query.sig.gz database.zip -o results.csv --cores 4 ``` -As of v0.9.5, `fastgather` outputs the same columns as `sourmash gather`, with only a few exception +`fastgather` outputs the same columns as `sourmash gather`, with only a few exceptions * `match_name` is output instead of `name`; * `match_md5` is output instead of `md5`; * `match_filename` is output instead of `filename`, and the value is different; @@ -392,6 +390,11 @@ To report _any_ overlap between two sketches, set the threshold to 0. (This will produce many, many results when searching a collection of metagenomes!) +Using `-A/--output-all-comparisons` will ignore the threshold parameter +and output all comparisons done. Against a RocksDB database, only matches +with some overlap will be reported; with collections of sketches, all +pairs will be reported. + By default, `manysearch` will display the contents of the CSV file in a human-readable format. This can be disabled with `-N/--no-pretty-print` when executing large searches. @@ -452,14 +455,14 @@ pathlist format, and specify the desired output directory; we suggest using the `.rocksdb` extension for RocksDB databases, e.g. `-o gtdb-rs214-k31.rocksdb`. -By default, as of v0.9.7, `index` will store a copy of the sketches +By default, `index` will store a copy of the sketches along with the inverted index. This will substantially increase the disk space required for large databases. You can provide an optional `--no-internal-storage` to `index` to store them externally, which reduces the disk space needed for the index. Read below for technical details! -As of v0.9.8, `index` can take any of the supported input types, but +`index` can take any of the supported input types, but unless you are using a zip file or a pathlist of JSON files, it may need to load all the sketches into memory before indexing them. Moreover, you can only use external storage with a zip file. 
We @@ -470,9 +473,6 @@ the sketches are being loaded into memory. #### Internal vs external storage of sketches in a RocksDB index -(The below applies to v0.9.7 and later of the plugin; for v0.9.6 and -before, only external storage was implemented.) - RocksDB indexes support containment queries (a la the [branchwater application](https://github.com/sourmash-bio/branchwater)), as well as `gather`-style mixture decomposition (see @@ -489,7 +489,7 @@ the original source sketches used to construct the database, wherever they reside on your disk. The sketches *are not used* by `manysearch`, but *are used* by -`fastmultigather`: with v0.9.6 and later, you'll get an error if you +`fastmultigather`: you'll get an error if you run `fastmultigather` against a RocksDB index where the sketches cannot be loaded. @@ -521,6 +521,21 @@ in downstream software packages (this plugin, and [the branchwater application code](https://github.com/sourmash-bio/branchwater)). The above documentation applies to sourmash core v0.15.0. +## Notes on versioning and semantic versioning guarantees + +Unlike sourmash, +[which provides guarantees that command-line options and outputs will not change within minor versions](https://sourmash.readthedocs.io/en/latest/support.html#versioning-and-stability-of-features-and-apis), +we make no guarantees of stability within the branchwater plugin. This +is because the branchwater plugin is intended to move fast and +occasionally break things. + +Eventually we expect to provide all of the branchwater plugin's functionality within the sourmash package, at which time the sourmash guarantees will apply! + +However, we do not expect command line options and output file formats +to change quickly. + +We will also endeavor to avoid changing column names in CSV output, although we may change the _order_ of columns on occasion. Please use the column headers (column names) to select specific columns. 
+ ## Notes on concurrency and efficiency Each command does things somewhat differently, with implications for diff --git a/src/lib.rs b/src/lib.rs index afa5b857..8226a633 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,7 +28,7 @@ mod singlesketch; use camino::Utf8PathBuf as PathBuf; #[pyfunction] -#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, output_path=None, ignore_abundance=false))] +#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, output_path=None, ignore_abundance=false, output_all_comparisons=false))] #[allow(clippy::too_many_arguments)] fn do_manysearch( querylist_path: String, @@ -39,6 +39,7 @@ fn do_manysearch( moltype: String, output_path: Option, ignore_abundance: Option, + output_all_comparisons: Option, ) -> anyhow::Result { let againstfile_path: PathBuf = siglist_path.clone().into(); let selection = build_selection(ksize, scaled, &moltype); @@ -46,6 +47,7 @@ fn do_manysearch( let allow_failed_sigpaths = true; let ignore_abundance = ignore_abundance.unwrap_or(false); + let output_all_comparisons = output_all_comparisons.unwrap_or(false); // if siglist_path is revindex, run rocksdb manysearch; otherwise run manysearch if is_revindex_database(&againstfile_path) { @@ -57,6 +59,7 @@ fn do_manysearch( threshold, output_path, allow_failed_sigpaths, + output_all_comparisons, ) { Ok(_) => Ok(0), Err(e) => { @@ -73,6 +76,7 @@ fn do_manysearch( output_path, allow_failed_sigpaths, ignore_abundance, + output_all_comparisons, ) { Ok(_) => Ok(0), Err(e) => { @@ -232,7 +236,7 @@ fn do_check(index: String, quick: bool) -> anyhow::Result { } #[pyfunction] -#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, estimate_ani, estimate_prob_overlap, output_path=None))] +#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, estimate_ani, estimate_prob_overlap, output_all_comparisons, output_path=None))] 
#[allow(clippy::too_many_arguments)] fn do_multisearch( querylist_path: String, @@ -243,6 +247,7 @@ fn do_multisearch( moltype: String, estimate_ani: bool, estimate_prob_overlap: bool, + output_all_comparisons: bool, output_path: Option, ) -> anyhow::Result { let _ = env_logger::try_init(); @@ -258,6 +263,7 @@ fn do_multisearch( allow_failed_sigpaths, estimate_ani, estimate_prob_overlap, + output_all_comparisons, output_path, ) { Ok(_) => Ok(0), @@ -270,7 +276,7 @@ fn do_multisearch( #[pyfunction] #[allow(clippy::too_many_arguments)] -#[pyo3(signature = (siglist_path, threshold, ksize, scaled, moltype, estimate_ani, write_all, output_path=None))] +#[pyo3(signature = (siglist_path, threshold, ksize, scaled, moltype, estimate_ani, write_all, output_all_comparisons, output_path=None))] fn do_pairwise( siglist_path: String, threshold: f64, @@ -279,6 +285,7 @@ fn do_pairwise( moltype: String, estimate_ani: bool, write_all: bool, + output_all_comparisons: bool, output_path: Option, ) -> anyhow::Result { let selection = build_selection(ksize, scaled, &moltype); @@ -290,6 +297,7 @@ fn do_pairwise( allow_failed_sigpaths, estimate_ani, write_all, + output_all_comparisons, output_path, ) { Ok(_) => Ok(0), diff --git a/src/manysearch.rs b/src/manysearch.rs index 968f1548..d5d629ca 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -9,7 +9,9 @@ use stats::{median, stddev}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use crate::utils::{csvwriter_thread, load_collection, ReportType, SearchResult}; +use crate::utils::{ + csvwriter_thread, load_collection, ManySearchResult, ReportType, SmallSignature, +}; use sourmash::ani_utils::ani_from_containment; use sourmash::errors::SourmashError; use sourmash::selection::Selection; @@ -25,6 +27,7 @@ pub fn manysearch( output: Option, allow_failed_sigpaths: bool, ignore_abundance: bool, + output_all_comparisons: bool, ) -> Result<()> { // Load query collection let query_collection = load_collection( @@ -61,7 +64,8 @@ 
pub fn manysearch( )?; // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output let thrd = csvwriter_thread(recv, output); @@ -95,80 +99,18 @@ pub fn manysearch( >::try_into(against_sig) { for query in query_sketchlist.iter() { - // be paranoid and confirm scaled match. - if query.minhash.scaled() != common_scaled { - panic!("different query scaled"); - } - if against_mh.scaled() != common_scaled { - panic!("different against scaled"); - } - - let overlap = query - .minhash - .count_common(&against_mh, false) - .expect("incompatible sketches") - as f64; - - let query_size = query.minhash.size() as f64; - let containment_query_in_target = overlap / query_size; - // only calculate results if we have shared hashes - if containment_query_in_target > threshold { - let target_size = against_mh.size() as f64; - let containment_target_in_query = overlap / target_size; - - let max_containment = - containment_query_in_target.max(containment_target_in_query); - let jaccard = overlap / (target_size + query_size - overlap); - - let qani = ani_from_containment( - containment_query_in_target, - against_mh.ksize() as f64, - ); - let mani = ani_from_containment( - containment_target_in_query, - against_mh.ksize() as f64, - ); - let query_containment_ani = Some(qani); - let match_containment_ani = Some(mani); - let average_containment_ani = Some((qani + mani) / 2.); - let max_containment_ani = Some(f64::max(qani, mani)); - - let calc_abund_stats = - against_mh.track_abundance() && !ignore_abundance; - let ( - total_weighted_hashes, - n_weighted_found, - average_abund, - median_abund, - std_abund, - ) = if calc_abund_stats { - inflate_abundances(&query.minhash, &against_mh).ok()? 
- } else { - (None, None, None, None, None) - }; - - results.push(SearchResult { - query_name: query.name.clone(), - query_md5: query.md5sum.clone(), - match_name: against_name.clone(), - containment: containment_query_in_target, - intersect_hashes: overlap as u64, - ksize: query.minhash.ksize() as u16, - scaled: query.minhash.scaled(), - moltype: query.minhash.hash_function().to_string(), - match_md5: Some(against_md5.clone()), - jaccard: Some(jaccard), - max_containment: Some(max_containment), - average_abund, - median_abund, - std_abund, - query_containment_ani, - match_containment_ani, - average_containment_ani, - max_containment_ani, - n_weighted_found, - total_weighted_hashes, - }); + let sr = calculate_manysearch_result( + query, + &against_mh, + &against_name, + &against_md5, + threshold, + common_scaled, + ignore_abundance, + output_all_comparisons, + ); + if let Some(sr) = sr { + results.push(sr); } } } else { @@ -226,6 +168,9 @@ pub fn manysearch( Ok(()) } +// inflate_abundances: "borrow" the abundances from 'against' onto the +// intersection with 'query'. + fn inflate_abundances( query: &KmerMinHash, against: &KmerMinHash, @@ -241,10 +186,9 @@ fn inflate_abundances( > { let abunds: Vec; let sum_weighted: u64; - let sum_all_abunds: u64; + let sum_all_abunds: u64 = against.sum_abunds(); (abunds, sum_weighted) = query.inflated_abundances(against)?; - sum_all_abunds = against.sum_abunds(); let average_abund = sum_weighted as f64 / abunds.len() as f64; let median_abund = median(abunds.iter().cloned()).expect("error"); @@ -258,3 +202,81 @@ fn inflate_abundances( Some(std_abund), )) } + +// calculate_manysearch_result: calculate all the things + +fn calculate_manysearch_result( + query: &SmallSignature, + against_mh: &KmerMinHash, + against_name: &str, + against_md5: &str, + threshold: f64, + common_scaled: u32, + ignore_abundance: bool, + output_all_comparisons: bool, +) -> Option { + // be paranoid and confirm scaled match. 
+ if query.minhash.scaled() != common_scaled { + panic!("different query scaled"); + } + if against_mh.scaled() != common_scaled { + panic!("different against scaled"); + } + + let overlap = query + .minhash + .count_common(against_mh, false) + .expect("incompatible sketches") as f64; + + let query_size = query.minhash.size() as f64; + let containment_query_in_target = overlap / query_size; + + // only calculate results if we have shared hashes + if containment_query_in_target > threshold || output_all_comparisons { + let target_size = against_mh.size() as f64; + let containment_target_in_query = overlap / target_size; + + let max_containment = containment_query_in_target.max(containment_target_in_query); + let jaccard = overlap / (target_size + query_size - overlap); + + let qani = ani_from_containment(containment_query_in_target, against_mh.ksize() as f64); + let mani = ani_from_containment(containment_target_in_query, against_mh.ksize() as f64); + let query_containment_ani = Some(qani); + let match_containment_ani = Some(mani); + let average_containment_ani = Some((qani + mani) / 2.); + let max_containment_ani = Some(f64::max(qani, mani)); + + let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; + let (total_weighted_hashes, n_weighted_found, average_abund, median_abund, std_abund) = + if calc_abund_stats { + inflate_abundances(&query.minhash, against_mh).ok()? 
+ } else { + (None, None, None, None, None) + }; + + let sr = ManySearchResult { + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), + match_name: against_name.to_string(), + containment: containment_query_in_target, + intersect_hashes: overlap as u64, + ksize: query.minhash.ksize() as u16, + scaled: query.minhash.scaled(), + moltype: query.minhash.hash_function().to_string(), + match_md5: Some(against_md5.to_string()), + jaccard: Some(jaccard), + max_containment: Some(max_containment), + average_abund, + median_abund, + std_abund, + query_containment_ani, + match_containment_ani, + average_containment_ani, + max_containment_ani, + n_weighted_found, + total_weighted_hashes, + }; + return Some(sr); + } + None +} diff --git a/src/manysearch_rocksdb.rs b/src/manysearch_rocksdb.rs index 30f75dd5..85fd8542 100644 --- a/src/manysearch_rocksdb.rs +++ b/src/manysearch_rocksdb.rs @@ -14,7 +14,7 @@ use sourmash::sketch::minhash::KmerMinHash; use sourmash::storage::SigStore; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, + csvwriter_thread, is_revindex_database, load_collection, ManySearchResult, ReportType, }; pub fn manysearch_rocksdb( @@ -24,6 +24,7 @@ pub fn manysearch_rocksdb( minimum_containment: f64, output: Option, allow_failed_sigpaths: bool, + output_all_comparisons: bool, ) -> Result<(), Box> { if !is_revindex_database(&index) { bail!("'{}' is not a valid RevIndex database", index); @@ -68,7 +69,8 @@ pub fn manysearch_rocksdb( )?; // set up a multi-producer, single-consumer channel. 
- let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output let thrd = csvwriter_thread(recv, output); @@ -107,13 +109,13 @@ pub fn manysearch_rocksdb( // filter the matches for containment for (path, overlap) in matches { let containment = overlap as f64 / query_size as f64; - if containment >= minimum_containment { + if containment >= minimum_containment || output_all_comparisons { let query_containment_ani = Some(ani_from_containment( containment, query_mh.ksize() as f64, )); - results.push(SearchResult { + results.push(ManySearchResult { query_name: query_name.clone(), query_md5: query_md5.clone(), match_name: path.clone(), diff --git a/src/multisearch.rs b/src/multisearch.rs index 0aabba36..fb030a2b 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -143,6 +143,7 @@ pub fn multisearch( allow_failed_sigpaths: bool, estimate_ani: bool, estimate_prob_overlap: bool, + output_all_comparisons: bool, output: Option, ) -> Result<(), Box> { // Load all queries into memory at once. @@ -249,7 +250,7 @@ pub fn multisearch( let containment_query_in_target = overlap / query_size; - if containment_query_in_target > threshold { + if containment_query_in_target > threshold || output_all_comparisons { let containment_target_in_query = overlap / target_size; let max_containment = containment_query_in_target.max(containment_target_in_query); diff --git a/src/pairwise.rs b/src/pairwise.rs index b9e1a3b3..f5d2d362 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -20,6 +20,7 @@ pub fn pairwise( allow_failed_sigpaths: bool, estimate_ani: bool, write_all: bool, + output_all_comparisons: bool, output: Option, ) -> Result<(), Box> { // Load all sigs into memory at once. 
@@ -86,7 +87,10 @@ pub fn pairwise( let containment_adjusted_log10 = None; let tf_idf_score = None; - if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { + if containment_q1_in_q2 > threshold + || containment_q2_in_q1 > threshold + || output_all_comparisons + { let max_containment = containment_q1_in_q2.max(containment_q2_in_q1); let jaccard = overlap / (query1_size + query2_size - overlap); let mut query_containment_ani = None; @@ -133,7 +137,7 @@ pub fn pairwise( eprintln!("Processed {} comparisons", i); } } - if write_all { + if write_all || output_all_comparisons { let mut query_containment_ani = None; let mut match_containment_ani = None; let mut average_containment_ani = None; diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 3de354aa..ba2f5798 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -107,6 +107,12 @@ def __init__(self, p): action="store_true", help="do not do expensive abundance calculations", ) + p.add_argument( + "-A", + "--output-all-comparisons", + action="store_true", + help="ignore threshold and output all comparisons; against a RocksDB database, this will only output comparisons with some overlap", + ) def main(self, args): print_version() @@ -129,6 +135,7 @@ def main(self, args): args.moltype, args.output, args.ignore_abundance, + args.output_all_comparisons, ) if status == 0: notify(f"...manysearch is done! 
results in '{args.output}'") @@ -462,6 +469,12 @@ def __init__(self, p): action="store_true", help="estimate probability of overlap for significance ranking of search results, of the specific query and match, given all queries and possible matches", ) + p.add_argument( + "-A", + "--output-all-comparisons", + action="store_true", + help="ignore threshold and output all comparisons", + ) def main(self, args): print_version() @@ -488,6 +501,7 @@ def main(self, args): args.moltype, args.ani, args.prob_significant_overlap, + args.output_all_comparisons, args.output, ) if status == 0: @@ -545,9 +559,16 @@ def __init__(self, p): ) p.add_argument( "--write-all", + "--write-self-comparisons", action="store_true", help="write self comparisons for all sketches", ) + p.add_argument( + "-A", + "--output-all-comparisons", + action="store_true", + help="ignore threshold and output all comparisons", + ) def main(self, args): print_version() @@ -570,6 +591,7 @@ def main(self, args): args.moltype, args.ani, args.write_all, + args.output_all_comparisons, args.output, ) if status == 0: diff --git a/src/python/tests/test_manysearch.py b/src/python/tests/test_manysearch.py index 6275b0cf..f978272f 100644 --- a/src/python/tests/test_manysearch.py +++ b/src/python/tests/test_manysearch.py @@ -113,6 +113,116 @@ def test_simple(runtmp, zip_query, zip_against): assert max_ani == 0.9772 +def test_simple_output_all(runtmp, zip_query, zip_against): + # test basic execution! 
+ query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") + + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + + make_file_list(query_list, [sig2, sig47, sig63]) + make_file_list(against_list, [sig2, sig47, sig63]) + + output = runtmp.output("out.csv") + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) + + runtmp.sourmash( + "scripts", + "manysearch", + query_list, + against_list, + "-o", + output, + "-t", + "0.01", + "-A", + ) + assert os.path.exists(output) + + df = pandas.read_csv(output) + assert len(df) == 9 + + dd = df.to_dict(orient="index") + print(dd) + + for idx, row in dd.items(): + # identical? + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) + assert float(row["match_containment_ani"] == 1.0) + assert float(row["average_containment_ani"] == 1.0) + assert float(row["max_containment_ani"] == 1.0) + + else: + # confirm hand-checked numbers + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + query_ani = float(row["query_containment_ani"]) + match_ani = float(row["match_containment_ani"]) + average_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) + jaccard = round(jaccard, 4) + cont = round(cont, 4) + maxcont = round(maxcont, 4) + query_ani = round(query_ani, 4) + match_ani = round(match_ani, 4) + average_ani = round(average_ani, 4) + max_ani = round(max_ani, 4) + print( + q, + m, + 
f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + f"{query_ani:.04}", + f"{match_ani:.04}", + f"{average_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "NC_011665.1" and m == "NC_009661.1": + assert jaccard == 0.3207 + assert cont == 0.4828 + assert maxcont == 0.4885 + assert intersect_hashes == 2529 + assert query_ani == 0.9768 + assert match_ani == 0.9772 + assert average_ani == 0.977 + assert max_ani == 0.9772 + elif q == "NC_009661.1" and m == "NC_011665.1": + assert jaccard == 0.3207 + assert cont == 0.4885 + assert maxcont == 0.4885 + assert intersect_hashes == 2529 + assert query_ani == 0.9772 + assert match_ani == 0.9768 + assert average_ani == 0.977 + assert max_ani == 0.9772 + else: + assert jaccard == 0 + assert cont == 0 + assert maxcont == 0 + assert intersect_hashes == 0 + assert query_ani == 0 + assert match_ani == 0 + assert average_ani == 0 + assert max_ani == 0 + + def test_simple_abund(runtmp): # test with abund sig sig2 = get_test_data("2.fa.sig.gz") diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index dfc65ee2..7b82dc33 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -91,6 +91,80 @@ def test_simple_no_ani(runtmp, zip_query, zip_db): assert intersect_hashes == 2529 +def test_simple_no_ani_output_all(runtmp, zip_query, zip_db): + # test basic execution! 
+ query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") + + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + + make_file_list(query_list, [sig2, sig47, sig63]) + make_file_list(against_list, [sig2, sig47, sig63]) + + output = runtmp.output("out.csv") + + if zip_db: + against_list = zip_siglist(runtmp, against_list, runtmp.output("db.zip")) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) + + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output, "-A" + ) + assert os.path.exists(output) + + df = pandas.read_csv(output) + assert len(df) == 9 + + dd = df.to_dict(orient="index") + print(dd) + + for idx, row in dd.items(): + assert not ("prob_overlap" in row) + + # identical? + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert "query_containment_ani" not in row + assert "match_containment_ani" not in row + assert "average_containment_ani" not in row + assert "max_containment_ani" not in row + + else: + # confirm hand-checked numbers + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float_round(row["containment"], 4) + jaccard = float_round(row["jaccard"], 4) + maxcont = float_round(row["max_containment"], 4) + + intersect_hashes = int(row["intersect_hashes"]) + + print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}") + + if q == "NC_011665.1" and m == "NC_009661.1": + assert jaccard == 0.3207 + assert cont == 0.4828 + assert maxcont == 0.4885 + assert intersect_hashes == 2529 + + elif q == "NC_009661.1" and m == "NC_011665.1": + assert jaccard == 0.3207 + assert cont == 0.4885 + assert maxcont == 0.4885 + assert intersect_hashes == 2529 + else: + assert jaccard == 0 + assert cont == 0 + 
assert maxcont == 0 + assert intersect_hashes == 0 + + def test_simple_prob_overlap(runtmp, zip_query, zip_db, indexed_query, indexed_against): # test basic execution! query_list = runtmp.output("query.txt") diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 1a940043..1bf469bf 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -20,6 +20,17 @@ def test_installed(runtmp): assert "usage: pairwise" in runtmp.last_result.err +def test_on_dir(runtmp, capfd): + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash( + "scripts", "pairwise", runtmp.output(""), "-o", runtmp.output("xxx.csv") + ) + + captured = capfd.readouterr() + print(captured.err) + assert "arbitrary directories are not supported" in captured.err + + def test_simple_no_ani(runtmp, capfd, zip_query, indexed): # test basic execution! query_list = runtmp.output("query.txt") @@ -80,6 +91,31 @@ def test_simple_no_ani(runtmp, capfd, zip_query, indexed): ) +def test_simple_no_ani_output_all(runtmp, capfd, zip_query, indexed): + # test basic execution! + query_list = runtmp.output("query.txt") + + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + + make_file_list(query_list, [sig2, sig47, sig63]) + + output = runtmp.output("out.csv") + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) + + if indexed: + query_list = index_siglist(runtmp, query_list, runtmp.output("db")) + + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output, "-t", "-1", "-A") + assert os.path.exists(output) + + df = pandas.read_csv(output) + assert len(df) == 6 + + def test_simple_ani(runtmp, zip_query): # test basic execution! 
query_list = runtmp.output("query.txt") @@ -147,6 +183,26 @@ def test_simple_ani(runtmp, zip_query): assert q2_ani == 0.9772 assert avg_ani == 0.977 assert max_ani == 0.9772 + elif m == "NC_011665.1" and q == "NC_009661.1": + assert jaccard == 0.3207 + assert cont == 0.4885 + assert maxcont == 0.4885 + assert intersect_hashes == 2529 + assert q2_ani == 0.9768 + assert q1_ani == 0.9772 + assert avg_ani == 0.977 + assert max_ani == 0.9772 + elif q == m: + assert jaccard == 1 + else: + assert jaccard == 0 + assert cont == 0 + assert maxcont == 0 + assert intersect_hashes == 0 + assert q1_ani == 0 + assert q2_ani == 0 + assert avg_ani == 0 + assert max_ani == 0 def test_simple_threshold(runtmp, zip_query): diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 3f8ffc79..5674d13d 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -14,7 +14,7 @@ use serde::{Deserialize, Serialize}; // use rust_decimal::{MathematicalOps, Decimal}; use std::cmp::{Ordering, PartialOrd}; use std::collections::BinaryHeap; -use std::fs::{create_dir_all, File}; +use std::fs::{create_dir_all, metadata, File}; use std::io::{BufWriter, Write}; use std::panic; use std::sync::atomic; @@ -33,7 +33,7 @@ use std::collections::{HashMap, HashSet}; use std::hash::{Hash, Hasher}; pub mod multicollection; -use multicollection::MultiCollection; +pub use multicollection::{MultiCollection, SmallSignature}; pub mod buildutils; use buildutils::{BuildCollection, BuildManifest}; @@ -555,6 +555,14 @@ pub fn load_collection( } }); + // we support RocksDB directory paths, but nothing else, unlike sourmash. 
+ if collection.is_none() { + let path_metadata = metadata(sigpath.clone()).expect("getting path metadata failed"); + if path_metadata.is_dir() { + bail!("arbitrary directories are not supported as input"); + } + } + let collection = collection.or_else( || match MultiCollection::from_standalone_manifest(&sigpath) { @@ -617,7 +625,7 @@ pub fn load_collection( /// /// # Arguments /// -/// * `sketchlist` - A slice of loaded `SmallSignature` sketches. +/// * `collection` - A MultiCollection. /// * `skipped_paths` - # paths that contained no compatible sketches. /// * `failed_paths` - # paths that failed to load. /// * `report_type` - ReportType Enum (Query or Against). Used to specify @@ -999,7 +1007,7 @@ pub fn is_revindex_database(path: &camino::Utf8PathBuf) -> bool { } #[derive(Serialize)] -pub struct SearchResult { +pub struct ManySearchResult { pub query_name: String, pub query_md5: String, pub match_name: String, From 9d99130590485a705fa05d9c2a15554b5d423371 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 19 Dec 2024 23:01:41 +0000 Subject: [PATCH 5/5] Bump pyo3 from 0.23.2 to 0.23.3 (#540) Bumps [pyo3](https://github.com/pyo3/pyo3) from 0.23.2 to 0.23.3. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.23.2...v0.23.3) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 22 +++++++++++----------- Cargo.toml | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 933b5a90..9cf74d5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -206,7 +206,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.11.0", + "itertools 0.12.1", "lazy_static", "lazycell", "proc-macro2", @@ -1353,9 +1353,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.23.2" +version = "0.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f54b3d09cbdd1f8c20650b28e7b09e338881482f4aa908a5f61a00c98fba2690" +checksum = "e484fd2c8b4cb67ab05a318f1fd6fa8f199fcc30819f08f07d200809dba26c15" dependencies = [ "anyhow", "cfg-if", @@ -1372,9 +1372,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.23.2" +version = "0.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3015cf985888fe66cfb63ce0e321c603706cd541b7aec7ddd35c281390af45d8" +checksum = "dc0e0469a84f208e20044b98965e1561028180219e35352a2afaf2b942beff3b" dependencies = [ "once_cell", "target-lexicon", @@ -1382,9 +1382,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.23.2" +version = "0.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fca7cd8fd809b5ac4eefb89c1f98f7a7651d3739dfb341ca6980090f554c270" +checksum = "eb1547a7f9966f6f1a0f0227564a9945fe36b90da5a93b3933fc3dc03fae372d" dependencies = [ "libc", "pyo3-build-config", @@ -1392,9 +1392,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.23.2" +version = "0.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34e657fa5379a79151b6ff5328d9216a84f55dc93b17b08e7c3609a969b73aa0" +checksum = "fdb6da8ec6fa5cedd1626c886fc8749bdcbb09424a86461eb8cdf096b7c33257" dependencies = [ "proc-macro2", 
"pyo3-macros-backend", @@ -1404,9 +1404,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.23.2" +version = "0.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "295548d5ffd95fd1981d2d3cf4458831b21d60af046b729b6fd143b0ba7aee2f" +checksum = "38a385202ff5a92791168b1136afae5059d3ac118457bb7bc304c197c2d33e7d" dependencies = [ "heck 0.5.0", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index 4a845512..24241e9f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ name = "sourmash_plugin_branchwater" crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.23.2", features = ["extension-module", "anyhow"] } +pyo3 = { version = "0.23.3", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.216", features = ["derive"] } sourmash = { version = "0.17.2", features = ["branchwater"] }