From 480f319621ff430a13474b17ccd8530cf1f9acdc Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 17 Aug 2024 10:14:33 -0700 Subject: [PATCH 01/31] refactor & rename & consolidate --- src/python/tests/conftest.py | 4 ++ src/python/tests/sourmash_tst_utils.py | 17 +++++++++ src/python/tests/test_cluster.py | 10 +---- .../{test_gather.py => test_fastgather.py} | 30 +-------------- ...multigather.py => test_fastmultigather.py} | 17 +-------- src/python/tests/test_index.py | 12 +----- src/python/tests/test_manysearch.py | 38 +------------------ src/python/tests/test_multisearch.py | 29 +------------- src/python/tests/test_pairwise.py | 24 +----------- src/utils.rs | 2 +- 10 files changed, 29 insertions(+), 154 deletions(-) rename src/python/tests/{test_gather.py => test_fastgather.py} (96%) rename src/python/tests/{test_multigather.py => test_fastmultigather.py} (99%) diff --git a/src/python/tests/conftest.py b/src/python/tests/conftest.py index 052837f6..3f7021a1 100644 --- a/src/python/tests/conftest.py +++ b/src/python/tests/conftest.py @@ -16,6 +16,10 @@ def toggle_internal_storage(request): def zip_query(request): return request.param +@pytest.fixture(params=[True, False]) +def zip_db(request): + return request.param + @pytest.fixture(params=[True, False]) def zip_against(request): return request.param diff --git a/src/python/tests/sourmash_tst_utils.py b/src/python/tests/sourmash_tst_utils.py index 7c99b1b6..f4ad4927 100644 --- a/src/python/tests/sourmash_tst_utils.py +++ b/src/python/tests/sourmash_tst_utils.py @@ -14,6 +14,23 @@ from io import StringIO +def get_test_data(filename): + thisdir = os.path.dirname(__file__) + return os.path.join(thisdir, 'test-data', filename) + + +def make_file_list(filename, paths): + with open(filename, 'wt') as fp: + fp.write("\n".join(paths)) + fp.write("\n") + + +def zip_siglist(runtmp, siglist, db): + runtmp.sourmash('sig', 'cat', siglist, + '-o', db) + return db + + def scriptpath(scriptname='sourmash'): """Return the 
path to the scripts, in both dev and install situations.""" # note - it doesn't matter what the scriptname is here, as long as diff --git a/src/python/tests/test_cluster.py b/src/python/tests/test_cluster.py index 6e153946..4ae12173 100644 --- a/src/python/tests/test_cluster.py +++ b/src/python/tests/test_cluster.py @@ -2,15 +2,7 @@ import pytest from . import sourmash_tst_utils as utils - -def get_test_data(filename): - thisdir = os.path.dirname(__file__) - return os.path.join(thisdir, 'test-data', filename) - -def make_file_list(filename, paths): - with open(filename, 'wt') as fp: - fp.write("\n".join(paths)) - fp.write("\n") +from .sourmash_tst_utils import get_test_data, make_file_list def test_installed(runtmp): diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_fastgather.py similarity index 96% rename from src/python/tests/test_gather.py rename to src/python/tests/test_fastgather.py index 4ab4c6de..bd2ca5a4 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_fastgather.py @@ -4,23 +4,7 @@ import sourmash from . import sourmash_tst_utils as utils - - -def get_test_data(filename): - thisdir = os.path.dirname(__file__) - return os.path.join(thisdir, 'test-data', filename) - - -def make_file_list(filename, paths): - with open(filename, 'wt') as fp: - fp.write("\n".join(paths)) - fp.write("\n") - - -def zip_siglist(runtmp, siglist, db): - runtmp.sourmash('sig', 'cat', siglist, - '-o', db) - return db +from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist) def test_installed(runtmp): @@ -30,7 +14,6 @@ def test_installed(runtmp): assert 'usage: fastgather' in runtmp.last_result.err -@pytest.mark.parametrize('zip_against', [False, True]) def test_simple(runtmp, zip_against): # test basic execution! 
query = get_test_data('SRR606249.sig.gz') @@ -58,7 +41,6 @@ def test_simple(runtmp, zip_against): assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) -@pytest.mark.parametrize('zip_against', [False, True]) def test_simple_with_prefetch(runtmp, zip_against): # test basic execution! query = get_test_data('SRR606249.sig.gz') @@ -93,7 +75,6 @@ def test_simple_with_prefetch(runtmp, zip_against): assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} -@pytest.mark.parametrize('zip_against', [False, True]) def test_missing_query(runtmp, capfd, zip_against): # test missing query query = runtmp.output('no-such-file') @@ -122,7 +103,6 @@ def test_missing_query(runtmp, capfd, zip_against): assert 'Error: No such file or directory' in captured.err -@pytest.mark.parametrize('zip_against', [False, True]) def test_bad_query(runtmp, capfd, zip_against): # test non-sig query query = runtmp.output('no-such-file') @@ -154,7 +134,6 @@ def test_bad_query(runtmp, capfd, zip_against): assert 'Error: Fastgather requires a single query sketch. Check input:' in captured.err -@pytest.mark.parametrize('zip_against', [False, True]) def test_missing_against(runtmp, capfd, zip_against): # test missing against query = get_test_data('SRR606249.sig.gz') @@ -278,7 +257,6 @@ def test_bad_against_3(runtmp, capfd): assert 'InvalidArchive' in captured.err -@pytest.mark.parametrize('zip_against', [False, True]) def test_against_multisigfile(runtmp, zip_against): # test against a sigfile that contains multiple sketches query = get_test_data('SRR606249.sig.gz') @@ -311,7 +289,6 @@ def test_against_multisigfile(runtmp, zip_against): # @CTB this is a bug :(. It should load multiple sketches properly! 
-@pytest.mark.parametrize('zip_against', [False, True]) def test_query_multisigfile(runtmp, capfd, zip_against): # test with a sigfile that contains multiple sketches against_list = runtmp.output('against.txt') @@ -341,7 +318,6 @@ def test_query_multisigfile(runtmp, capfd, zip_against): assert "Error: Fastgather requires a single query sketch. Check input:" in captured.err -@pytest.mark.parametrize('zip_against', [False, True]) def test_against_nomatch(runtmp, capfd, zip_against): # test with 'against' file containing a non-matching ksize query = get_test_data('SRR606249.sig.gz') @@ -370,7 +346,6 @@ def test_against_nomatch(runtmp, capfd, zip_against): assert 'WARNING: skipped 1 search paths - no compatible signatures.' in captured.err -@pytest.mark.parametrize('zip_against', [False, True]) def test_md5s(runtmp, zip_against): # check that the correct md5sums (of the original sketches) are in # the output files @@ -424,7 +399,6 @@ def test_md5s(runtmp, zip_against): assert ss.md5sum() in md5s -@pytest.mark.parametrize('zip_against', [False, True]) def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_against): # the column names should be strict subsets of sourmash prefetch cols query = get_test_data('SRR606249.sig.gz') @@ -466,7 +440,6 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_against): assert diff_keys == set(['unique_intersect_bp', 'median_abund', 'f_match_orig', 'std_abund', 'average_abund', 'f_unique_to_query', 'remaining_bp', 'f_unique_weighted', 'sum_weighted_found', 'total_weighted_hashes', 'n_unique_weighted_found', 'f_orig_query', 'f_match']) -@pytest.mark.parametrize('zip_against', [False, True]) def test_fastgather_gatherout_as_picklist(runtmp, zip_against): # should be able to use fastgather gather output as picklist query = get_test_data('SRR606249.sig.gz') @@ -508,7 +481,6 @@ def test_fastgather_gatherout_as_picklist(runtmp, zip_against): assert picklist_df.equals(full_df) -@pytest.mark.parametrize('zip_against', [False, True]) def 
test_fastgather_prefetchout_as_picklist(runtmp, zip_against): # should be able to use fastgather prefetch output as picklist query = get_test_data('SRR606249.sig.gz') diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_fastmultigather.py similarity index 99% rename from src/python/tests/test_multigather.py rename to src/python/tests/test_fastmultigather.py index b3649ee9..942c1d1b 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -8,17 +8,7 @@ import sourmash from . import sourmash_tst_utils as utils - - -def get_test_data(filename): - thisdir = os.path.dirname(__file__) - return os.path.join(thisdir, 'test-data', filename) - - -def make_file_list(filename, paths): - with open(filename, 'wt') as fp: - fp.write("\n".join(paths)) - fp.write("\n") +from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist) def index_siglist(runtmp, siglist, db, *, ksize=31, scaled=1000, moltype='DNA', @@ -37,11 +27,6 @@ def test_installed(runtmp): assert 'usage: fastmultigather' in runtmp.last_result.err -def zip_siglist(runtmp, siglist, db): - runtmp.sourmash('sig', 'cat', siglist, - '-o', db) - return db - def test_simple(runtmp, zip_against): # test basic execution! query = get_test_data('SRR606249.sig.gz') diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 69faf8ae..140fe799 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -5,17 +5,7 @@ import shutil from . 
import sourmash_tst_utils as utils - - -def get_test_data(filename): - thisdir = os.path.dirname(__file__) - return os.path.join(thisdir, 'test-data', filename) - - -def make_file_list(filename, paths): - with open(filename, 'wt') as fp: - fp.write("\n".join(paths)) - fp.write("\n") +from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist) def test_installed(runtmp): diff --git a/src/python/tests/test_manysearch.py b/src/python/tests/test_manysearch.py index 6deb5c3b..ab0f5762 100644 --- a/src/python/tests/test_manysearch.py +++ b/src/python/tests/test_manysearch.py @@ -4,17 +4,7 @@ import sourmash from . import sourmash_tst_utils as utils - - -def get_test_data(filename): - thisdir = os.path.dirname(__file__) - return os.path.join(thisdir, 'test-data', filename) - - -def make_file_list(filename, paths): - with open(filename, 'wt') as fp: - fp.write("\n".join(paths)) - fp.write("\n") +from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist) def test_installed(runtmp): @@ -23,10 +13,6 @@ def test_installed(runtmp): assert 'usage: manysearch' in runtmp.last_result.err -def zip_siglist(runtmp, siglist, db): - runtmp.sourmash('sig', 'cat', siglist, - '-o', db) - return db def index_siglist(runtmp, siglist, db, ksize=31, scaled=1000, moltype='DNA'): # build index @@ -35,8 +21,6 @@ def index_siglist(runtmp, siglist, db, ksize=31, scaled=1000, moltype='DNA'): '--moltype', moltype) return db -@pytest.mark.parametrize("zip_query", [False, True]) -@pytest.mark.parametrize("zip_against", [False, True]) def test_simple(runtmp, zip_query, zip_against): # test basic execution! query_list = runtmp.output('query.txt') @@ -192,7 +176,6 @@ def test_simple_abund(runtmp): assert total_weighted_hashes == 73489 -@pytest.mark.parametrize("zip_query", [False, True]) def test_simple_indexed(runtmp, zip_query): # test basic execution! 
query_list = runtmp.output('query.txt') @@ -249,8 +232,6 @@ def test_simple_indexed(runtmp, zip_query): assert query_ani == 0.9772 -@pytest.mark.parametrize("indexed", [False, True]) -@pytest.mark.parametrize("zip_query", [False, True]) def test_simple_with_cores(runtmp, capfd, indexed, zip_query): # test basic execution with -c argument (that it runs, at least!) query_list = runtmp.output('query.txt') @@ -283,8 +264,6 @@ def test_simple_with_cores(runtmp, capfd, indexed, zip_query): assert " using 4 threads" in result.err -@pytest.mark.parametrize("indexed", [False, True]) -@pytest.mark.parametrize("zip_query", [False, True]) def test_simple_threshold(runtmp, indexed, zip_query): # test with a simple threshold => only 3 results query_list = runtmp.output('query.txt') @@ -313,7 +292,6 @@ def test_simple_threshold(runtmp, indexed, zip_query): assert len(df) == 3 -@pytest.mark.parametrize("indexed", [False, True]) def test_simple_manifest(runtmp, indexed): # test with a simple threshold => only 3 results query_list = runtmp.output('query.txt') @@ -347,8 +325,6 @@ def test_simple_manifest(runtmp, indexed): assert len(df) == 3 -@pytest.mark.parametrize("indexed", [False, True]) -@pytest.mark.parametrize("zip_query", [False, True]) def test_missing_query(runtmp, capfd, indexed, zip_query): # test with a missing query list query_list = runtmp.output('query.txt') @@ -379,7 +355,6 @@ def test_missing_query(runtmp, capfd, indexed, zip_query): assert 'Error: No such file or directory' in captured.err -@pytest.mark.parametrize("indexed", [False, True]) def test_sig_query(runtmp, capfd, indexed): # test with a single sig query (a .sig.gz file) against_list = runtmp.output('against.txt') @@ -399,7 +374,6 @@ def test_sig_query(runtmp, capfd, indexed): '-o', output) -@pytest.mark.parametrize("indexed", [False, True]) def test_bad_query_2(runtmp, capfd, indexed): # test with a bad query list (a missing file) query_list = runtmp.output('query.txt') @@ -453,7 +427,6 @@ def 
test_bad_query_3(runtmp, capfd): assert 'InvalidArchive' in captured.err -@pytest.mark.parametrize("indexed", [False, True]) def test_missing_against(runtmp, capfd, indexed): # test with a missing against list query_list = runtmp.output('query.txt') @@ -524,7 +497,6 @@ def test_bad_against(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err -@pytest.mark.parametrize("indexed", [False, True]) def test_empty_query(runtmp, indexed, capfd): # test with an empty query list query_list = runtmp.output('query.txt') @@ -552,8 +524,6 @@ def test_empty_query(runtmp, indexed, capfd): assert "No query signatures loaded, exiting." in captured.err -@pytest.mark.parametrize("indexed", [False, True]) -@pytest.mark.parametrize("zip_query", [False, True]) def test_nomatch_query(runtmp, capfd, indexed, zip_query): # test a non-matching (diff ksize) in query; do we get warning message? query_list = runtmp.output('query.txt') @@ -584,8 +554,6 @@ def test_nomatch_query(runtmp, capfd, indexed, zip_query): assert 'WARNING: skipped 1 query paths - no compatible signatures.' in captured.err -@pytest.mark.parametrize("zip_against", [False, True]) -@pytest.mark.parametrize("indexed", [False, True]) def test_load_only_one_bug(runtmp, capfd, indexed, zip_against): # check that we behave properly when presented with multiple against # sketches @@ -619,8 +587,6 @@ def test_load_only_one_bug(runtmp, capfd, indexed, zip_against): assert not 'WARNING: no compatible sketches in path ' in captured.err -@pytest.mark.parametrize("zip_query", [False, True]) -@pytest.mark.parametrize("indexed", [False, True]) def test_load_only_one_bug_as_query(runtmp, capfd, indexed, zip_query): # check that we behave properly when presented with multiple query # sketches in one file, with only one matching. 
@@ -656,8 +622,6 @@ def test_load_only_one_bug_as_query(runtmp, capfd, indexed, zip_query): assert not 'WARNING: no compatible sketches in path ' in captured.err -@pytest.mark.parametrize("zip_query", [False, True]) -@pytest.mark.parametrize("indexed", [False, True]) def test_md5(runtmp, indexed, zip_query): # test that md5s match what was in the original files, not downsampled etc. query_list = runtmp.output('query.txt') diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index 611b0f81..87553615 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -5,17 +5,7 @@ import sourmash from . import sourmash_tst_utils as utils - - -def get_test_data(filename): - thisdir = os.path.dirname(__file__) - return os.path.join(thisdir, 'test-data', filename) - - -def make_file_list(filename, paths): - with open(filename, 'wt') as fp: - fp.write("\n".join(paths)) - fp.write("\n") +from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist) def test_installed(runtmp): @@ -24,13 +14,7 @@ def test_installed(runtmp): assert 'usage: multisearch' in runtmp.last_result.err -def zip_siglist(runtmp, siglist, db): - runtmp.sourmash('sig', 'cat', siglist, - '-o', db) - return db -@pytest.mark.parametrize("zip_query", [False, True]) -@pytest.mark.parametrize("zip_db", [False, True]) def test_simple_no_ani(runtmp, zip_query, zip_db): # test basic execution! query_list = runtmp.output('query.txt') @@ -99,8 +83,6 @@ def test_simple_no_ani(runtmp, zip_query, zip_db): assert intersect_hashes == 2529 -@pytest.mark.parametrize("zip_query", [False, True]) -@pytest.mark.parametrize("zip_db", [False, True]) def test_simple_ani(runtmp, zip_query, zip_db): # test basic execution! 
query_list = runtmp.output('query.txt') @@ -186,8 +168,6 @@ def test_simple_ani(runtmp, zip_query, zip_db): assert max_ani == 0.9772 -@pytest.mark.parametrize("zip_query", [False, True]) -@pytest.mark.parametrize("zip_db", [False, True]) def test_simple_threshold(runtmp, zip_query, zip_db): # test with a simple threshold => only 3 results query_list = runtmp.output('query.txt') @@ -243,7 +223,6 @@ def test_simple_manifest(runtmp): assert len(df) == 3 -@pytest.mark.parametrize("zip_query", [False, True]) def test_missing_query(runtmp, capfd, zip_query): # test with a missing query list query_list = runtmp.output('query.txt') @@ -344,7 +323,6 @@ def test_bad_query_3(runtmp, capfd): assert 'InvalidArchive' in captured.err -@pytest.mark.parametrize("zip_db", [False, True]) def test_missing_against(runtmp, capfd, zip_db): # test with a missing against list query_list = runtmp.output('query.txt') @@ -445,7 +423,6 @@ def test_empty_query(runtmp, capfd): # @CTB -@pytest.mark.parametrize("zip_query", [False, True]) def test_nomatch_query(runtmp, capfd, zip_query): # test a non-matching (diff ksize) in query; do we get warning message? query_list = runtmp.output('query.txt') @@ -474,7 +451,6 @@ def test_nomatch_query(runtmp, capfd, zip_query): assert 'WARNING: skipped 1 query paths - no compatible signatures' in captured.err -@pytest.mark.parametrize("zip_db", [False, True]) def test_load_only_one_bug(runtmp, capfd, zip_db): # check that we behave properly when presented with multiple against # sketches @@ -506,7 +482,6 @@ def test_load_only_one_bug(runtmp, capfd, zip_db): assert not 'WARNING: no compatible sketches in path' in captured.err -@pytest.mark.parametrize("zip_query", [False, True]) def test_load_only_one_bug_as_query(runtmp, capfd, zip_query): # check that we behave properly when presented with multiple query # sketches in one file, with only one matching. 
@@ -538,8 +513,6 @@ def test_load_only_one_bug_as_query(runtmp, capfd, zip_query): assert not 'WARNING: no compatible sketches in path ' in captured.err -@pytest.mark.parametrize("zip_query", [False, True]) -@pytest.mark.parametrize("zip_db", [False, True]) def test_md5(runtmp, zip_query, zip_db): # test that md5s match what was in the original files, not downsampled etc. query_list = runtmp.output('query.txt') diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 3869b3d4..c8264069 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -5,17 +5,7 @@ import sourmash from . import sourmash_tst_utils as utils - - -def get_test_data(filename): - thisdir = os.path.dirname(__file__) - return os.path.join(thisdir, 'test-data', filename) - - -def make_file_list(filename, paths): - with open(filename, 'wt') as fp: - fp.write("\n".join(paths)) - fp.write("\n") +from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist) def test_installed(runtmp): @@ -24,13 +14,7 @@ def test_installed(runtmp): assert 'usage: pairwise' in runtmp.last_result.err -def zip_siglist(runtmp, siglist, db): - runtmp.sourmash('sig', 'cat', siglist, - '-o', db) - return db - -@pytest.mark.parametrize("zip_query", [False, True]) def test_simple_no_ani(runtmp, zip_query): # test basic execution! query_list = runtmp.output('query.txt') @@ -81,7 +65,6 @@ def test_simple_no_ani(runtmp, zip_query): assert intersect_hashes == 2529 -@pytest.mark.parametrize("zip_query", [False, True]) def test_simple_ani(runtmp, zip_query): # test basic execution! 
query_list = runtmp.output('query.txt') @@ -140,7 +123,6 @@ def test_simple_ani(runtmp, zip_query): assert max_ani == 0.9772 -@pytest.mark.parametrize("zip_query", [False, True]) def test_simple_threshold(runtmp, zip_query): # test with a simple threshold => only 3 results query_list = runtmp.output('query.txt') @@ -248,7 +230,6 @@ def test_bad_query_2(runtmp, capfd): assert 'InvalidArchive' in captured.err -@pytest.mark.parametrize("zip_db", [False, True]) def test_missing_query(runtmp, capfd, zip_db): # test with a missing query list query_list = runtmp.output('query.txt') @@ -290,7 +271,6 @@ def test_empty_query(runtmp): # @CTB -@pytest.mark.parametrize("zip_query", [False, True]) def test_nomatch_query(runtmp, capfd, zip_query): # test a non-matching (diff ksize) in query; do we get warning message? query_list = runtmp.output('query.txt') @@ -317,7 +297,6 @@ def test_nomatch_query(runtmp, capfd, zip_query): assert 'WARNING: skipped 1 analysis paths - no compatible signatures' in captured.err -@pytest.mark.parametrize("zip_db", [False, True]) def test_load_only_one_bug(runtmp, capfd, zip_db): # check that we behave properly when presented with multiple query # sketches @@ -347,7 +326,6 @@ def test_load_only_one_bug(runtmp, capfd, zip_db): assert not 'WARNING: no compatible sketches in path ' in captured.err -@pytest.mark.parametrize("zip_query", [False, True]) def test_md5(runtmp, zip_query): # test that md5s match what was in the original files, not downsampled etc. 
query_list = runtmp.output('query.txt') diff --git a/src/utils.rs b/src/utils.rs index 4209413e..a5ef93a2 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1091,7 +1091,7 @@ pub fn consume_query_by_gather( pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { let hash_function = match moltype { - "dna" => HashFunctions::Murmur64Dna, + "DNA" => HashFunctions::Murmur64Dna, "protein" => HashFunctions::Murmur64Protein, "dayhoff" => HashFunctions::Murmur64Dayhoff, "hp" => HashFunctions::Murmur64Hp, From e6b1c5be3df76c56386480ff9fd5cdcc7ce90848 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 17 Aug 2024 10:40:56 -0700 Subject: [PATCH 02/31] remove 'lower' --- src/python/sourmash_plugin_branchwater/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 4c923e34..cf11ed56 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -69,7 +69,6 @@ def __init__(self, p): def main(self, args): print_version() notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}") - args.moltype = args.moltype.lower() num_threads = set_thread_pool(args.cores) notify(f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads") @@ -117,7 +116,6 @@ def __init__(self, p): def main(self, args): print_version() notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold bp: {args.threshold_bp}") - args.moltype = args.moltype.lower() num_threads = set_thread_pool(args.cores) @@ -164,7 +162,6 @@ def __init__(self, p): def main(self, args): print_version() notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold bp: {args.threshold_bp}") - args.moltype = args.moltype.lower() num_threads = set_thread_pool(args.cores) @@ 
-209,7 +206,6 @@ def __init__(self, p): def main(self, args): notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} ") - args.moltype = args.moltype.lower() num_threads = set_thread_pool(args.cores) @@ -274,7 +270,6 @@ def __init__(self, p): def main(self, args): print_version() notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}") - args.moltype = args.moltype.lower() num_threads = set_thread_pool(args.cores) @@ -321,7 +316,6 @@ def __init__(self, p): def main(self, args): print_version() notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}") - args.moltype = args.moltype.lower() num_threads = set_thread_pool(args.cores) From 3bef912a3c3f23960e6021b0c691b1a76db14756 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 17 Aug 2024 10:47:37 -0700 Subject: [PATCH 03/31] trying out a lower level (index) API --- src/branch_api.rs | 38 +++++++++++++++++++ src/lib.rs | 8 ++++ .../sourmash_plugin_branchwater/__init__.py | 1 + 3 files changed, 47 insertions(+) create mode 100644 src/branch_api.rs diff --git a/src/branch_api.rs b/src/branch_api.rs new file mode 100644 index 00000000..8c19efa2 --- /dev/null +++ b/src/branch_api.rs @@ -0,0 +1,38 @@ +/// Lower-level Python API implementation for sourmash_plugin_branchwater +use pyo3::prelude::*; + +use crate::utils::build_selection; +use crate::utils::load_collection; +use crate::utils::ReportType; + +#[pyclass] +pub struct BranchCollection { + #[pyo3(get)] + pub val: i32, +} + + +#[pyfunction] +pub fn api_load_collection( + location: String, + ksize: u8, + scaled: usize, + moltype: String, +) -> PyResult> { + let selection = build_selection(ksize, scaled, &moltype); + +/* + match load_collection(&location, &selection, ReportType::Query, true) { + Ok(_) => Ok(0), + Err(e) => { + eprintln!("Error: {e}"); + Ok(1) + } +} + */ + let obj = + Python::with_gil(|py| Py::new(py, 
BranchCollection { val: 1001 }).unwrap()); + Ok(obj) +} + + diff --git a/src/lib.rs b/src/lib.rs index ea49b0b6..0eb733aa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,7 @@ use pyo3::prelude::*; extern crate simple_error; mod utils; +mod python_api; use crate::utils::build_selection; use crate::utils::is_revindex_database; mod check; @@ -320,8 +321,10 @@ fn do_cluster( } } + #[pymodule] fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { + // top level 'scripts' commands m.add_function(wrap_pyfunction!(do_manysearch, m)?)?; m.add_function(wrap_pyfunction!(do_fastgather, m)?)?; m.add_function(wrap_pyfunction!(do_fastmultigather, m)?)?; @@ -332,5 +335,10 @@ fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult m.add_function(wrap_pyfunction!(do_multisearch, m)?)?; m.add_function(wrap_pyfunction!(do_pairwise, m)?)?; m.add_function(wrap_pyfunction!(do_cluster, m)?)?; + + // lower level API stuff + m.add_class::()?; + m.add_function(wrap_pyfunction!(python_api::api_load_collection, m)?)?; + Ok(()) } diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index cf11ed56..ebfe2e0f 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -7,6 +7,7 @@ import importlib.metadata from . import sourmash_plugin_branchwater +from . import sourmash_plugin_branchwater as api from . import prettyprint __version__ = importlib.metadata.version("sourmash_plugin_branchwater") From ba2e96748b76f6886591670bb65a49e2a12768db Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 17 Aug 2024 10:48:27 -0700 Subject: [PATCH 04/31] fix --- src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0eb733aa..26d2ad1e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,7 @@ use pyo3::prelude::*; extern crate simple_error; mod utils; -mod python_api; +mod branch_api; use crate::utils::build_selection; use crate::utils::is_revindex_database; mod check; @@ -337,8 +337,8 @@ fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult m.add_function(wrap_pyfunction!(do_cluster, m)?)?; // lower level API stuff - m.add_class::()?; - m.add_function(wrap_pyfunction!(python_api::api_load_collection, m)?)?; + m.add_class::()?; + m.add_function(wrap_pyfunction!(branch_api::api_load_collection, m)?)?; Ok(()) } From 5c7a7600d47ff817e3d2bb258b1da5c7f46abb6f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 17 Aug 2024 11:21:57 -0700 Subject: [PATCH 05/31] compiling... --- src/branch_api.rs | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/branch_api.rs b/src/branch_api.rs index 8c19efa2..1a271341 100644 --- a/src/branch_api.rs +++ b/src/branch_api.rs @@ -1,6 +1,7 @@ /// Lower-level Python API implementation for sourmash_plugin_branchwater use pyo3::prelude::*; +use sourmash::collection::Collection; use crate::utils::build_selection; use crate::utils::load_collection; use crate::utils::ReportType; @@ -9,6 +10,7 @@ use crate::utils::ReportType; pub struct BranchCollection { #[pyo3(get)] pub val: i32, + collection: Collection } @@ -21,18 +23,8 @@ pub fn api_load_collection( ) -> PyResult> { let selection = build_selection(ksize, scaled, &moltype); -/* - match load_collection(&location, &selection, ReportType::Query, true) { - Ok(_) => Ok(0), - Err(e) => { - eprintln!("Error: {e}"); - Ok(1) - } -} - */ + let collection = load_collection(&location, &selection, ReportType::Query, true).unwrap(); let obj = - 
Python::with_gil(|py| Py::new(py, BranchCollection { val: 1001 }).unwrap()); + Python::with_gil(|py| Py::new(py, BranchCollection { val: 1001, collection }).unwrap()); Ok(obj) } - - From 87b4563aace15a9a3a472b5583e77cdac9fb2f0f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 17 Aug 2024 11:24:42 -0700 Subject: [PATCH 06/31] y'know ...tests --- src/python/tests/test_index_api.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 src/python/tests/test_index_api.py diff --git a/src/python/tests/test_index_api.py b/src/python/tests/test_index_api.py new file mode 100644 index 00000000..d24b46e8 --- /dev/null +++ b/src/python/tests/test_index_api.py @@ -0,0 +1,18 @@ +import sourmash_plugin_branchwater as branch +from . import sourmash_tst_utils as utils +from .sourmash_tst_utils import get_test_data + + +def test_basic(): + sigfile = get_test_data('SRR606249.sig.gz') + res = branch.api.api_load_collection(sigfile, 31, 100_000, 'DNA') + assert res.val == 1001 + + +def test_fail(): + sigfile = get_test_data('XXX_SRR606249.sig.gz') + try: + res = branch.api.api_load_collection(sigfile, 31, 100_000, 'DNA') + except: + pass + # @CTB should do something better here ;) From 92c9f3c7932d8e41276a7351365150ce5a23b657 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 17 Aug 2024 11:47:42 -0700 Subject: [PATCH 07/31] add __len__ --- src/branch_api.rs | 7 +++++++ src/python/tests/test_index_api.py | 2 ++ 2 files changed, 9 insertions(+) diff --git a/src/branch_api.rs b/src/branch_api.rs index 1a271341..04f7e751 100644 --- a/src/branch_api.rs +++ b/src/branch_api.rs @@ -13,6 +13,13 @@ pub struct BranchCollection { collection: Collection } +#[pymethods] +impl BranchCollection { + pub fn __len__(&self) -> PyResult { + Ok(self.collection.len()) + } +} + #[pyfunction] pub fn api_load_collection( diff --git a/src/python/tests/test_index_api.py b/src/python/tests/test_index_api.py index d24b46e8..9bc1c238 100644 --- a/src/python/tests/test_index_api.py +++ b/src/python/tests/test_index_api.py @@ -7,9 +7,11 @@ def test_basic(): sigfile = get_test_data('SRR606249.sig.gz') res = branch.api.api_load_collection(sigfile, 31, 100_000, 'DNA') assert res.val == 1001 + assert len(res) == 1 def test_fail(): + # try to load a (nonexistent) collection sigfile = get_test_data('XXX_SRR606249.sig.gz') try: res = branch.api.api_load_collection(sigfile, 31, 100_000, 'DNA') From 02e3e7d04e2a99e84ad5567ae8f457de33d10502 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 17 Aug 2024 12:07:33 -0700 Subject: [PATCH 08/31] victoir --- src/branch_api.rs | 37 ++++++++++++++++++++++++++++++ src/python/tests/test_index_api.py | 13 +++++++++++ 2 files changed, 50 insertions(+) diff --git a/src/branch_api.rs b/src/branch_api.rs index 04f7e751..6b19a6b4 100644 --- a/src/branch_api.rs +++ b/src/branch_api.rs @@ -2,10 +2,35 @@ use pyo3::prelude::*; use sourmash::collection::Collection; +use sourmash::manifest::{Manifest, Record}; use crate::utils::build_selection; use crate::utils::load_collection; use crate::utils::ReportType; +#[pyclass] +pub struct BranchRecord { + record: Record +} + +#[pymethods] +impl BranchRecord { + pub fn get_name(&self) -> PyResult { + Ok(self.record.name().clone()) + } +} + +#[pyclass] +pub struct BranchManifest { + manifest: Manifest +} + +#[pymethods] +impl BranchManifest { + pub fn __len__(&self) -> PyResult { + Ok(self.manifest.len()) + } +} + #[pyclass] pub struct BranchCollection { #[pyo3(get)] @@ -18,6 +43,18 @@ impl BranchCollection { pub fn __len__(&self) -> PyResult { Ok(self.collection.len()) } + + pub fn get_manifest(&self) -> PyResult> { + let manifest: Manifest = self.collection.manifest().clone(); + let obj = Python::with_gil(|py| Py::new(py, BranchManifest { manifest: manifest }).unwrap()); + Ok(obj) + } + pub fn get_first_record(&self) -> PyResult> { + let records: Vec<_> = self.collection.iter().collect(); + let first_record = records.first().unwrap().1; + let obj = Python::with_gil(|py| Py::new(py, BranchRecord { record: first_record.clone() }).unwrap()); + Ok(obj) + } } diff --git a/src/python/tests/test_index_api.py b/src/python/tests/test_index_api.py index 9bc1c238..cb13b238 100644 --- a/src/python/tests/test_index_api.py +++ b/src/python/tests/test_index_api.py @@ -18,3 +18,16 @@ def test_fail(): except: pass # @CTB should do something better here ;) + + +def test_basic_get_manifest(): + sigfile = get_test_data('SRR606249.sig.gz') + res = 
branch.api.api_load_collection(sigfile, 31, 100_000, 'DNA') + mf = res.get_manifest() + print(mf, dir(mf)) + assert len(mf) == 1 + + rec = res.get_first_record() + print(rec, dir(rec)) + + print(rec.get_name()) From 9209b63cf164c8d88aab90ea20cd32ecb7492237 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 17 Aug 2024 12:31:53 -0700 Subject: [PATCH 09/31] more better --- src/branch_api.rs | 51 ++++++++++++++++++++++++------ src/lib.rs | 3 +- src/python/tests/test_index_api.py | 5 +++ 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/src/branch_api.rs b/src/branch_api.rs index 6b19a6b4..9d2366e8 100644 --- a/src/branch_api.rs +++ b/src/branch_api.rs @@ -1,15 +1,15 @@ /// Lower-level Python API implementation for sourmash_plugin_branchwater use pyo3::prelude::*; -use sourmash::collection::Collection; -use sourmash::manifest::{Manifest, Record}; use crate::utils::build_selection; use crate::utils::load_collection; use crate::utils::ReportType; +use sourmash::collection::Collection; +use sourmash::manifest::{Manifest, Record}; #[pyclass] pub struct BranchRecord { - record: Record + record: Record, } #[pymethods] @@ -21,7 +21,7 @@ impl BranchRecord { #[pyclass] pub struct BranchManifest { - manifest: Manifest + manifest: Manifest, } #[pymethods] @@ -35,7 +35,7 @@ impl BranchManifest { pub struct BranchCollection { #[pyo3(get)] pub val: i32, - collection: Collection + collection: Collection, } #[pymethods] @@ -46,17 +46,40 @@ impl BranchCollection { pub fn get_manifest(&self) -> PyResult> { let manifest: Manifest = self.collection.manifest().clone(); - let obj = Python::with_gil(|py| Py::new(py, BranchManifest { manifest: manifest }).unwrap()); + let obj = + Python::with_gil(|py| Py::new(py, BranchManifest { manifest: manifest }).unwrap()); Ok(obj) } pub fn get_first_record(&self) -> PyResult> { let records: Vec<_> = self.collection.iter().collect(); let first_record = records.first().unwrap().1; - let obj = Python::with_gil(|py| Py::new(py, 
BranchRecord { record: first_record.clone() }).unwrap()); + let obj = Python::with_gil(|py| { + Py::new( + py, + BranchRecord { + record: first_record.clone(), + }, + ) + .unwrap() + }); Ok(obj) } -} + pub fn get_records(&self) -> PyResult> { + let records: Vec<_> = self.collection.iter().collect(); + + let obj = records + .iter() + .map(|x| { + BranchRecord { + record: x.1.clone(), + } + }) + .collect(); + + Ok(obj) + } +} #[pyfunction] pub fn api_load_collection( @@ -68,7 +91,15 @@ pub fn api_load_collection( let selection = build_selection(ksize, scaled, &moltype); let collection = load_collection(&location, &selection, ReportType::Query, true).unwrap(); - let obj = - Python::with_gil(|py| Py::new(py, BranchCollection { val: 1001, collection }).unwrap()); + let obj = Python::with_gil(|py| { + Py::new( + py, + BranchCollection { + val: 1001, + collection, + }, + ) + .unwrap() + }); Ok(obj) } diff --git a/src/lib.rs b/src/lib.rs index 26d2ad1e..be8d327a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,8 +4,8 @@ use pyo3::prelude::*; #[macro_use] extern crate simple_error; -mod utils; mod branch_api; +mod utils; use crate::utils::build_selection; use crate::utils::is_revindex_database; mod check; @@ -321,7 +321,6 @@ fn do_cluster( } } - #[pymodule] fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { // top level 'scripts' commands diff --git a/src/python/tests/test_index_api.py b/src/python/tests/test_index_api.py index cb13b238..40f44bd4 100644 --- a/src/python/tests/test_index_api.py +++ b/src/python/tests/test_index_api.py @@ -31,3 +31,8 @@ def test_basic_get_manifest(): print(rec, dir(rec)) print(rec.get_name()) + + print(res.get_records()) + for rec in res.get_records(): + print(rec.get_name()) + assert 0 From a7a723ea225513b7005bc28ffeb7f6f468bcffeb Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 17 Aug 2024 13:53:05 -0700 Subject: [PATCH 10/31] add as load_from plugin --- pyproject.toml | 3 +++ src/branch_api.rs | 24 +++++++++++++++++-- .../sourmash_plugin_branchwater/__init__.py | 9 +++++++ src/python/tests/test_index_api.py | 4 ++-- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 70a26945..f9571c04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,9 @@ authors = [ requires = ["maturin>=1.4.0,<2"] build-backend = "maturin" +[project.entry-points."sourmash.load_from"] +collection_reader = "sourmash_plugin_branchwater:load_collection" + [project.entry-points."sourmash.cli_script"] manysearch = "sourmash_plugin_branchwater:Branchwater_Manysearch" multisearch = "sourmash_plugin_branchwater:Branchwater_Multisearch" diff --git a/src/branch_api.rs b/src/branch_api.rs index 9d2366e8..12ec81cc 100644 --- a/src/branch_api.rs +++ b/src/branch_api.rs @@ -17,6 +17,10 @@ impl BranchRecord { pub fn get_name(&self) -> PyResult { Ok(self.record.name().clone()) } + + pub fn to_row_dict(&self, py: Python) -> PyResult { + Ok("hello".to_string()) + } } #[pyclass] @@ -29,12 +33,22 @@ impl BranchManifest { pub fn __len__(&self) -> PyResult { Ok(self.manifest.len()) } + pub fn _check_row_values(&self) -> PyResult { + Ok(true) + } } #[pyclass] pub struct BranchCollection { #[pyo3(get)] - pub val: i32, + pub location: String, + + #[pyo3(get)] + pub is_database: bool, + + #[pyo3(get)] + pub has_manifest: bool, + collection: Collection, } @@ -44,6 +58,7 @@ impl BranchCollection { Ok(self.collection.len()) } + #[getter] pub fn get_manifest(&self) -> PyResult> { let manifest: Manifest = self.collection.manifest().clone(); let obj = @@ -53,6 +68,8 @@ impl BranchCollection { pub fn get_first_record(&self) -> PyResult> { let records: Vec<_> = self.collection.iter().collect(); let first_record = records.first().unwrap().1; + + // @CTB: can I turn this into something automatic? 
let obj = Python::with_gil(|py| { Py::new( py, @@ -77,6 +94,7 @@ impl BranchCollection { }) .collect(); + // @CTB: this does the GIL grabbing as needed? Ok(obj) } } @@ -95,8 +113,10 @@ pub fn api_load_collection( Py::new( py, BranchCollection { - val: 1001, + location: location, collection, + is_database: false, + has_manifest: true, }, ) .unwrap() diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index ebfe2e0f..6cd3539a 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -3,6 +3,7 @@ import argparse from sourmash.plugins import CommandLinePlugin from sourmash.logging import notify +from sourmash.exceptions import IndexNotLoaded import os import importlib.metadata @@ -38,6 +39,14 @@ def set_thread_pool(user_cores): return actual_rayon_cores +def load_collection(path, *, traverse_yield_all=False, cache_size=0): + try: + return api.api_load_collection(path, 31, 100_000, 'DNA') + except: + raise IndexNotLoaded +load_collection.priority = 20 + + class Branchwater_Manysearch(CommandLinePlugin): command = 'manysearch' description = 'search many metagenomes for contained genomes' diff --git a/src/python/tests/test_index_api.py b/src/python/tests/test_index_api.py index 40f44bd4..23a07cc9 100644 --- a/src/python/tests/test_index_api.py +++ b/src/python/tests/test_index_api.py @@ -6,7 +6,7 @@ def test_basic(): sigfile = get_test_data('SRR606249.sig.gz') res = branch.api.api_load_collection(sigfile, 31, 100_000, 'DNA') - assert res.val == 1001 + assert res.location == sigfile assert len(res) == 1 @@ -23,7 +23,7 @@ def test_fail(): def test_basic_get_manifest(): sigfile = get_test_data('SRR606249.sig.gz') res = branch.api.api_load_collection(sigfile, 31, 100_000, 'DNA') - mf = res.get_manifest() + mf = res.manifest print(mf, dir(mf)) assert len(mf) == 1 From 5c673a3428402966a06ce621ff6bbaaa8ad08fb3 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 17 Aug 2024 14:50:36 -0700 Subject: [PATCH 11/31] I mean it seems to work, right? --- src/branch_api.rs | 45 ++++++++++++++++++++++++++++-- src/python/tests/test_index_api.py | 5 ++-- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/src/branch_api.rs b/src/branch_api.rs index 12ec81cc..f4c1ae2c 100644 --- a/src/branch_api.rs +++ b/src/branch_api.rs @@ -6,6 +6,7 @@ use crate::utils::load_collection; use crate::utils::ReportType; use sourmash::collection::Collection; use sourmash::manifest::{Manifest, Record}; +use pyo3::types::{IntoPyDict, PyDict, PyList}; #[pyclass] pub struct BranchRecord { @@ -18,11 +19,39 @@ impl BranchRecord { Ok(self.record.name().clone()) } - pub fn to_row_dict(&self, py: Python) -> PyResult { - Ok("hello".to_string()) + #[getter] + pub fn get_as_row<'py>(&self, py: Python<'py>) -> PyResult> { + let dict = { + let key_vals: Vec<(&str, PyObject)> = vec![ + ("ksize", self.record.ksize().to_object(py)), + ("moltype", self.record.moltype().to_string().to_object(py)), + ("scaled", 1000.to_object(py)), // self.record.scaled().to_object(py)), + ("num", 0.to_object(py)), //, self.record.num().to_object(py)), + ("with_abundance", false.to_object(py)), // self.record.with_abundance().to_object(py)), + ("n_hashes", 100.to_object(py)), // self.record.n_hashes().to_object(py)), + ]; + key_vals.into_py_dict_bound(py) + }; + Ok(dict) } } +/* +impl IntoPyDict for I +where + T: PyDictItem + I: IntoIterator +fn into_py_dict(self, py: Python<'_>) -> Bound<'_, PyDict> { + let dict = PyDict::new(py); + for item in self { + dict.set_item(item.key(), item.value()) + .expect("Failed to set_item on dict"); + } + dict +} +} +*/ + #[pyclass] pub struct BranchManifest { manifest: Manifest, @@ -36,6 +65,15 @@ impl BranchManifest { pub fn _check_row_values(&self) -> PyResult { Ok(true) } + #[getter] + pub fn get_rows<'py>(&self, py: Python<'py>) -> PyResult>> { + let res: Vec<_> = self.manifest.iter().map(|x| { BranchRecord { + 
record: x.clone(), + }.get_as_row(py).unwrap() + }).collect(); + + Ok(res) + } } #[pyclass] @@ -82,7 +120,8 @@ impl BranchCollection { Ok(obj) } - pub fn get_records(&self) -> PyResult> { + #[getter] + pub fn get_rows(&self) -> PyResult> { let records: Vec<_> = self.collection.iter().collect(); let obj = records diff --git a/src/python/tests/test_index_api.py b/src/python/tests/test_index_api.py index 23a07cc9..a146b65d 100644 --- a/src/python/tests/test_index_api.py +++ b/src/python/tests/test_index_api.py @@ -29,10 +29,11 @@ def test_basic_get_manifest(): rec = res.get_first_record() print(rec, dir(rec)) + print('ZZZ', rec.as_row) print(rec.get_name()) - print(res.get_records()) - for rec in res.get_records(): + print(mf.rows) + for rec in mf.rows: print(rec.get_name()) assert 0 From c8bf07417d36042f3566a92277c9985c04eb1906 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 17 Aug 2024 16:47:14 -0700 Subject: [PATCH 12/31] provide wrapper objects, for now --- src/branch_api.rs | 3 -- .../sourmash_plugin_branchwater/__init__.py | 39 ++++++++++++++++++- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/branch_api.rs b/src/branch_api.rs index f4c1ae2c..535fab56 100644 --- a/src/branch_api.rs +++ b/src/branch_api.rs @@ -84,9 +84,6 @@ pub struct BranchCollection { #[pyo3(get)] pub is_database: bool, - #[pyo3(get)] - pub has_manifest: bool, - collection: Collection, } diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 6cd3539a..22351d2a 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -39,9 +39,46 @@ def set_thread_pool(user_cores): return actual_rayon_cores +class BranchwaterManifestWrapper: + def __init__(self, mf_obj): + self.obj = mf_obj + + def _check_row_values(self): + return self.obj._check_row_values() + + @property + def rows(self): + return self.obj.rows + + +class 
BranchwaterCollectionWrapper: + def __init__(self, coll_obj): + self.obj = coll_obj + + @property + def location(self): + return self.obj.location + + @property + def is_database(self): + return self.obj.is_database + + @property + def has_manifest(self): + return self.obj.has_manifest + + @property + def manifest(self): + return BranchwaterManifestWrapper(self.obj.manifest) + + def __len__(self): + return len(self.obj) + + def load_collection(path, *, traverse_yield_all=False, cache_size=0): try: - return api.api_load_collection(path, 31, 100_000, 'DNA') + coll_obj = api.api_load_collection(path, 31, 100_000, 'DNA') + return BranchwaterCollectionWrapper(coll_obj) except: raise IndexNotLoaded load_collection.priority = 20 From 8d4fdb5f0d23846cde9fcf6638b19308ec0da2e3 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 17 Aug 2024 17:53:28 -0700 Subject: [PATCH 13/31] shrug --- src/branch_api.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/branch_api.rs b/src/branch_api.rs index 535fab56..f4c1ae2c 100644 --- a/src/branch_api.rs +++ b/src/branch_api.rs @@ -84,6 +84,9 @@ pub struct BranchCollection { #[pyo3(get)] pub is_database: bool, + #[pyo3(get)] + pub has_manifest: bool, + collection: Collection, } From 0d7a556f95cfbbd6bf6f58a47a30522f695342c5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 18 Aug 2024 06:01:47 -0700 Subject: [PATCH 14/31] add cargo doc output for private fn --- .cargo/config.toml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 00000000..a05a706a --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,2 @@ +[build] +rustdocflags = ["--document-private-items"] From 1da0cf305d6b2ddaa509be9f416c9043517f3c01 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 18 Aug 2024 06:09:48 -0700 Subject: [PATCH 15/31] add a few comments/docs --- src/lib.rs | 8 +++++++- src/utils.rs | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 7d623ea7..2a7fa58d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,8 @@ -/// Python interface Rust code for sourmash_plugin_branchwater. +//! Rust-to-Python interface code for sourmash_plugin_branchwater, using pyo3. +//! +//! If you're using Rust, you're probably most interested in +//! [utils](utils/index.html) + use pyo3::prelude::*; #[macro_use] extern crate simple_error; @@ -322,6 +326,8 @@ fn do_cluster( } } +/// Module interface for the `sourmash_plugin_branchwater` extension module. + #[pymodule] fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(do_manysearch, m)?)?; diff --git a/src/utils.rs b/src/utils.rs index 4209413e..bd6cbba3 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,4 +1,5 @@ -/// Utility functions for sourmash_plugin_branchwater. +//! Utility functions for `sourmash_plugin_branchwater`. + use rayon::prelude::*; use sourmash::encodings::HashFunctions; use sourmash::selection::Select; From 2e7f027a2159ba95ab7549b665f73f94a921c3f4 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 18 Aug 2024 07:12:23 -0700 Subject: [PATCH 16/31] switch to dev version of sourmash --- Cargo.lock | 23 +++++++++++------------ Cargo.toml | 3 ++- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c386a6c8..3f362496 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -740,9 +740,9 @@ checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "libloading" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" +checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", "windows-targets", @@ -772,9 +772,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.18" +version = "1.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c15da26e5af7e25c90b37a2d75cdbf940cf4a55316de9d84c679c9b8bfabf82e" +checksum = "fdc53a7799a7496ebc9fd29f31f7df80e83c9bda5299768af5f9e59eeea74647" dependencies = [ "cc", "pkg-config", @@ -795,9 +795,9 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "lz4-sys" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9764018d143cc854c9f17f0b907de70f14393b1f502da6375dce70f00514eb3" +checksum = "109de74d5d2353660401699a4174a4ff23fcc649caf553df71933c7fb45ad868" dependencies = [ "cc", "libc", @@ -1481,18 +1481,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.206" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b3e4cd94123dd520a128bcd11e34d9e9e423e7e3e50425cb1b4b1e3549d0284" +checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.206" +version = "1.0.208" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" +checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", @@ -1551,8 +1551,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8655e639cc4a32fa1422629c9b4ff603ee09cf6d04a97eacd37594382472d437" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#d23ef6bdeaa8655443083628905e077e88f56a21" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 353ec2d1..7422983e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.2", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.206", features = ["derive"] } -sourmash = { version = "0.15.0", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "misc_rs_updates", features = ["branchwater"] } +#sourmash = { version = "0.15.0", features = ["branchwater"] } serde_json = "1.0.124" niffler = "2.4.0" log = "0.4.22" From 6b9e00fb20c3815c3829efa49af5a7d0edbfdc9c Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 18 Aug 2024 07:32:56 -0700 Subject: [PATCH 17/31] tracking --- Cargo.lock | 2 +- src/multisearch.rs | 11 +++++++++++ src/utils.rs | 28 ++++++++++++++++++---------- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f362496..dbf39445 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1551,7 +1551,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.15.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#d23ef6bdeaa8655443083628905e077e88f56a21" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#952ccf64aa102a2b7e1a22e75710d03dcedc77e2" dependencies = [ "az", "byteorder", diff --git a/src/multisearch.rs b/src/multisearch.rs index 19d2264d..1695ff60 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -60,12 +60,23 @@ pub fn multisearch( let processed_cmp = AtomicUsize::new(0); let ksize = selection.ksize().unwrap() as f64; + if queries.len() == 0 { + eprintln!("No query sketches present. Exiting."); + return Err(anyhow::anyhow!("foo").into()); // @CTB + } + + if against.len() == 0 { + eprintln!("No search sketches present. Exiting."); + return Err(anyhow::anyhow!("foo").into()); // @CTB + } + let send = against .par_iter() .filter_map(|against| { let mut results = vec![]; // search for matches & save containment. 
for query in queries.iter() { + eprintln!("XXXX"); let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); if i % 100000 == 0 && i > 0 { eprintln!("Processed {} comparisons", i); diff --git a/src/utils.rs b/src/utils.rs index bd6cbba3..8a6b83f6 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -443,16 +443,24 @@ pub fn load_sketches( let sketchinfo: Vec = collection .par_iter() .filter_map(|(_idx, record)| { - let sig = collection.sig_from_record(record).ok()?; - let selected_sig = sig.clone().select(selection).ok()?; - let minhash = selected_sig.minhash()?.clone(); - - Some(SmallSignature { - location: record.internal_location().to_string(), - name: sig.name(), - md5sum: sig.md5sum(), - minhash, - }) + match collection.sig_from_record(record) { + Ok(sig) => { + let selected_sig = sig.clone().select(selection).ok()?; + let minhash = selected_sig.minhash()?.clone(); + + Some(SmallSignature { + location: record.internal_location().to_string(), + name: sig.name(), + md5sum: sig.md5sum(), + minhash, + }) + }, + Err(_) => { + eprintln!("FAILED to load sketch from '{}'", + record.internal_location()); + None + } + } }) .collect(); From 2747935be3286f8175fd8e193747cb4aa7532b0a Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 18 Aug 2024 09:18:18 -0700 Subject: [PATCH 18/31] cleaner --- Cargo.lock | 2 +- src/multisearch.rs | 4 ++-- src/utils.rs | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dbf39445..5a9f3ba7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1551,7 +1551,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.15.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#952ccf64aa102a2b7e1a22e75710d03dcedc77e2" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#38013af8efad824e4396b51e4354c9a8b5f9f606" dependencies = [ "az", "byteorder", diff --git a/src/multisearch.rs b/src/multisearch.rs index 1695ff60..9fc3e8ae 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -62,12 +62,12 @@ pub fn multisearch( if queries.len() == 0 { eprintln!("No query sketches present. Exiting."); - return Err(anyhow::anyhow!("foo").into()); // @CTB + return Err(anyhow::anyhow!("failed to load query sketches").into()); } if against.len() == 0 { eprintln!("No search sketches present. 
Exiting."); - return Err(anyhow::anyhow!("foo").into()); // @CTB + return Err(anyhow::anyhow!("failed to load search sketches").into()); } let send = against diff --git a/src/utils.rs b/src/utils.rs index 8a6b83f6..80d4328b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -433,7 +433,7 @@ fn process_prefix_csv( Ok((results, n_fastas)) } -// Load all compatible minhashes from a collection into memory +// Load all compatible minhashes from a collection into memory, in parallel; // also store sig name and md5 alongside, as we usually need those pub fn load_sketches( collection: Collection, @@ -573,6 +573,7 @@ fn collection_from_manifest( Err(anyhow!("could not read as manifest: '{}'", sigpath)) } else { // If the manifest is not empty, proceed to create and return the Collection + eprintln!("collection from manifest!"); Ok(Collection::new( manifest, InnerStorage::new( From 4f49ef808cd43931e7521b53345c95ff026ed41d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 18 Aug 2024 09:20:38 -0700 Subject: [PATCH 19/31] cleanup --- Cargo.lock | 2 +- src/multisearch.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5a9f3ba7..63397361 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1551,7 +1551,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.15.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#38013af8efad824e4396b51e4354c9a8b5f9f606" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#08459a8dd72d209e13ccbc7726bfaeb8245e52c0" dependencies = [ "az", "byteorder", diff --git a/src/multisearch.rs b/src/multisearch.rs index 9fc3e8ae..6edad2c1 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -76,7 +76,6 @@ pub fn multisearch( let mut results = vec![]; // search for matches & save containment. 
for query in queries.iter() { - eprintln!("XXXX"); let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); if i % 100000 == 0 && i > 0 { eprintln!("Processed {} comparisons", i); From af1c82de2eb41df6389ec8792265153d41e2ba5e Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 18 Aug 2024 11:18:42 -0700 Subject: [PATCH 20/31] load rocksdb natively --- Cargo.lock | 2 +- src/utils.rs | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 63397361..f1f47c19 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1551,7 +1551,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.15.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#08459a8dd72d209e13ccbc7726bfaeb8245e52c0" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#e67415c8a7cdddc81be58654ff1dc7041e62ad4d" dependencies = [ "az", "byteorder", diff --git a/src/utils.rs b/src/utils.rs index 80d4328b..b7d6507c 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -546,6 +546,8 @@ impl std::fmt::Display for ReportType { } } +/// Load a collection from a .zip file. + pub fn collection_from_zipfile(sigpath: &Path, report_type: &ReportType) -> Result { match Collection::from_zipfile(sigpath) { Ok(collection) => Ok(collection), @@ -553,6 +555,17 @@ pub fn collection_from_zipfile(sigpath: &Path, report_type: &ReportType) -> Resu } } +/// Load a collection from a RocksDB. + +pub fn collection_from_rocksdb(sigpath: &Path, report_type: &ReportType) -> Result { + match Collection::from_rocksdb(sigpath) { + Ok(collection) => Ok(collection), + Err(_) => bail!("failed to load {} rocksdb: '{}'", report_type, sigpath), + } +} + +/// Load a collection from a manifest CSV. 
+ fn collection_from_manifest( sigpath: &Path, report_type: &ReportType, @@ -586,6 +599,8 @@ fn collection_from_manifest( } } +/// Load a collection from a list of paths. + fn collection_from_pathlist( sigpath: &Path, report_type: &ReportType, @@ -651,6 +666,8 @@ fn collection_from_pathlist( Ok((collection, n_failed)) } +/// Load a collection from a .sig/.sig.gz JSON file. + fn collection_from_signature(sigpath: &Path, report_type: &ReportType) -> Result { let signatures = Signature::from_path(sigpath).with_context(|| { format!( @@ -667,6 +684,8 @@ fn collection_from_signature(sigpath: &Path, report_type: &ReportType) -> Result }) } +/// Load a collection from a path - this is the top-level load function. + pub fn load_collection( siglist: &String, selection: &Selection, @@ -679,10 +698,12 @@ pub fn load_collection( bail!("No such file or directory: '{}'", &sigpath); } - // disallow rocksdb input here + // disallow rocksdb input here - CTB test me a lot ;) + /* if is_revindex_database(&sigpath) { bail!("Cannot load {} signatures from a 'rocksdb' database. Please use sig, zip, or pathlist.", report_type); - } +} + */ eprintln!("Reading {}(s) from: '{}'", report_type, &siglist); let mut last_error = None; @@ -699,6 +720,15 @@ pub fn load_collection( None }; + let collection = + collection.or_else(|| match collection_from_rocksdb(&sigpath, &report_type) { + Ok(coll) => Some((coll, 0)), + Err(e) => { + last_error = Some(e); + None + } + }); + let collection = collection.or_else(|| match collection_from_manifest(&sigpath, &report_type) { Ok(coll) => Some((coll, 0)), From 53924d6603a2618e11b8f64d0d013b084e587c3b Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 18 Aug 2024 14:22:10 -0700 Subject: [PATCH 21/31] foo --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index f1f47c19..b6e9cdf4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1551,7 +1551,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.15.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#e67415c8a7cdddc81be58654ff1dc7041e62ad4d" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#603fa0b5bd02d595e94f0463a4047d9129613e02" dependencies = [ "az", "byteorder", From 7c73b1a467f953bf9072af85de326fbad0f0fa5b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 19 Aug 2024 06:04:49 -0700 Subject: [PATCH 22/31] update to latest sourmash --- Cargo.lock | 11 +++++------ Cargo.toml | 3 ++- src/branch_api.rs | 8 ++++---- src/python/sourmash_plugin_branchwater/__init__.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4ee546a5..fc67e7d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1481,18 +1481,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.206" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b3e4cd94123dd520a128bcd11e34d9e9e423e7e3e50425cb1b4b1e3549d0284" +checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.206" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" +checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", @@ -1551,8 +1551,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" 
version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8655e639cc4a32fa1422629c9b4ff603ee09cf6d04a97eacd37594382472d437" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#603fa0b5bd02d595e94f0463a4047d9129613e02" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 40fd2e42..ed840a0f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.2", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.206", features = ["derive"] } -sourmash = { version = "0.15.0", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "misc_rs_updates", features = ["branchwater"] } +#sourmash = { version = "0.15.0", features = ["branchwater"] } serde_json = "1.0.124" niffler = "2.4.0" log = "0.4.22" diff --git a/src/branch_api.rs b/src/branch_api.rs index f4c1ae2c..637bff0e 100644 --- a/src/branch_api.rs +++ b/src/branch_api.rs @@ -25,10 +25,10 @@ impl BranchRecord { let key_vals: Vec<(&str, PyObject)> = vec![ ("ksize", self.record.ksize().to_object(py)), ("moltype", self.record.moltype().to_string().to_object(py)), - ("scaled", 1000.to_object(py)), // self.record.scaled().to_object(py)), - ("num", 0.to_object(py)), //, self.record.num().to_object(py)), - ("with_abundance", false.to_object(py)), // self.record.with_abundance().to_object(py)), - ("n_hashes", 100.to_object(py)), // self.record.n_hashes().to_object(py)), + ("scaled", self.record.scaled().to_object(py)), + ("num", self.record.num().to_object(py)), + ("with_abundance", self.record.with_abundance().to_object(py)), + ("n_hashes", self.record.n_hashes().to_object(py)), ]; key_vals.into_py_dict_bound(py) }; diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 22351d2a..a5d838ea 100755 --- 
a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -80,7 +80,7 @@ def load_collection(path, *, traverse_yield_all=False, cache_size=0): coll_obj = api.api_load_collection(path, 31, 100_000, 'DNA') return BranchwaterCollectionWrapper(coll_obj) except: - raise IndexNotLoaded + raise IndexNotLoaded(f"branchwater could not load '{path}'") load_collection.priority = 20 From 3462f927a00abe4c059aedb1979b749b68d2906b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 19 Aug 2024 16:01:48 -0700 Subject: [PATCH 23/31] cargo fmt --- src/utils.rs | 59 ++++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index ec18ce36..1d4e6124 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -442,24 +442,24 @@ pub fn load_sketches( ) -> Result> { let sketchinfo: Vec = collection .par_iter() - .filter_map(|(_idx, record)| { - match collection.sig_from_record(record) { - Ok(sig) => { - let selected_sig = sig.clone().select(selection).ok()?; - let minhash = selected_sig.minhash()?.clone(); - - Some(SmallSignature { - location: record.internal_location().to_string(), - name: sig.name(), - md5sum: sig.md5sum(), - minhash, - }) - }, - Err(_) => { - eprintln!("FAILED to load sketch from '{}'", - record.internal_location()); - None - } + .filter_map(|(_idx, record)| match collection.sig_from_record(record) { + Ok(sig) => { + let selected_sig = sig.clone().select(selection).ok()?; + let minhash = selected_sig.minhash()?.clone(); + + Some(SmallSignature { + location: record.internal_location().to_string(), + name: sig.name(), + md5sum: sig.md5sum(), + minhash, + }) + } + Err(_) => { + eprintln!( + "FAILED to load sketch from '{}'", + record.internal_location() + ); + None } }) .collect(); @@ -700,10 +700,10 @@ pub fn load_collection( // disallow rocksdb input here - CTB test me a lot ;) /* - if is_revindex_database(&sigpath) { - bail!("Cannot 
load {} signatures from a 'rocksdb' database. Please use sig, zip, or pathlist.", report_type); -} - */ + if is_revindex_database(&sigpath) { + bail!("Cannot load {} signatures from a 'rocksdb' database. Please use sig, zip, or pathlist.", report_type); + } + */ eprintln!("Reading {}(s) from: '{}'", report_type, &siglist); let mut last_error = None; @@ -720,14 +720,13 @@ pub fn load_collection( None }; - let collection = - collection.or_else(|| match collection_from_rocksdb(&sigpath, &report_type) { - Ok(coll) => Some((coll, 0)), - Err(e) => { - last_error = Some(e); - None - } - }); + let collection = collection.or_else(|| match collection_from_rocksdb(&sigpath, &report_type) { + Ok(coll) => Some((coll, 0)), + Err(e) => { + last_error = Some(e); + None + } + }); let collection = collection.or_else(|| match collection_from_manifest(&sigpath, &report_type) { From 9823ef6c98b19f0c7c8821e43802e6e335490681 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 19 Aug 2024 18:53:21 -0700 Subject: [PATCH 24/31] upd --- Cargo.lock | 31 ++++++++++++++++--------------- Cargo.toml | 2 +- src/utils.rs | 2 +- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 45061cc0..3e9bc27e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -713,9 +713,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" dependencies = [ "wasm-bindgen", ] @@ -1551,7 +1551,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.15.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=misc_rs_updates#603fa0b5bd02d595e94f0463a4047d9129613e02" +source = 
"git+https://github.com/sourmash-bio/sourmash.git?branch=more_rs_updates#affae94848a79a57b0b7cef801d41054e60458ee" dependencies = [ "az", "byteorder", @@ -1815,19 +1815,20 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" dependencies = [ "cfg-if", + "once_cell", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" dependencies = [ "bumpalo", "log", @@ -1840,9 +1841,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1850,9 +1851,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", @@ -1863,15 +1864,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" [[package]] name = "web-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 61086ded..abc937f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.2", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.208", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "misc_rs_updates", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "more_rs_updates", features = ["branchwater"] } #sourmash = { version = "0.15.0", features = ["branchwater"] } serde_json = "1.0.125" niffler = "2.4.0" diff --git a/src/utils.rs b/src/utils.rs index 1d4e6124..bec20457 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -442,7 +442,7 @@ pub fn load_sketches( ) -> Result> { let sketchinfo: Vec = collection .par_iter() - .filter_map(|(_idx, record)| match collection.sig_from_record(record) { + .filter_map(|(_idx, record)| match collection.sig_from_record2(record) { Ok(sig) => { let selected_sig = sig.clone().select(selection).ok()?; let minhash = selected_sig.minhash()?.clone(); From bfb5053fea322016d099d8c92e112b0eb32ca194 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Tue, 20 Aug 2024 10:32:15 -0700 Subject: [PATCH 25/31] upd --- src/fastmultigather.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 22b9efaa..4c5bb0ce 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -69,7 +69,10 @@ pub fn fastmultigather( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - query_collection.par_iter().for_each(|(_idx, record)| { + #[rustfmt::skip] + query_collection + .par_iter() + .for_each(|(_idx, record)| { // increment counter of # of queries. q: could we instead use the _idx from par_iter(), or will it vary based on thread? let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); // Load query sig (downsampling happens here) From c311a6984dbfb295cdcb43061eaeb267c8025687 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 20 Aug 2024 10:35:08 -0700 Subject: [PATCH 26/31] fix fmt --- src/fastmultigather.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 4c5bb0ce..22b9efaa 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -69,10 +69,7 @@ pub fn fastmultigather( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - #[rustfmt::skip] - query_collection - .par_iter() - .for_each(|(_idx, record)| { + query_collection.par_iter().for_each(|(_idx, record)| { // increment counter of # of queries. q: could we instead use the _idx from par_iter(), or will it vary based on thread? let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); // Load query sig (downsampling happens here) From 28b43d80ad6a57350a6a9a64ccdf6a5fd990255e Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Tue, 20 Aug 2024 12:19:13 -0700 Subject: [PATCH 27/31] MRG: create `MultiCollection` for collections that span multiple files (#434) * preliminary victory * compiles and mostly runs * cleanup, split to new module * cleanup and comment * more cleanup of diff * cargo fmt * fix fmt * restore n_failed * comment failing test * cleanup and de-vec * create module/submodule structure * comment for later * get rid of vec * beg for help * cleanup and doc --- src/fastgather.rs | 2 +- src/fastmultigather.rs | 4 +- src/manysearch.rs | 4 +- src/mastiff_manygather.rs | 4 +- src/mastiff_manysearch.rs | 4 +- src/python/tests/test_fastgather.py | 3 +- src/{utils.rs => utils/mod.rs} | 234 +++++----------------------- src/utils/multicollection.rs | 229 +++++++++++++++++++++++++++ 8 files changed, 281 insertions(+), 203 deletions(-) rename src/{utils.rs => utils/mod.rs} (86%) create mode 100644 src/utils/multicollection.rs diff --git a/src/fastgather.rs b/src/fastgather.rs index 46512025..e4271249 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -33,7 +33,7 @@ pub fn fastgather( ) } // get single query sig and minhash - let query_sig = query_collection.sig_for_dataset(0)?; // need this for original md5sum + let query_sig = query_collection.get_first_sig().unwrap(); let query_sig_ds = query_sig.clone().select(selection)?; // downsample let query_mh = match query_sig_ds.minhash() { Some(query_mh) => query_mh, diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 22b9efaa..0aa26adf 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -69,11 +69,11 @@ pub fn fastmultigather( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - query_collection.par_iter().for_each(|(_idx, record)| { + query_collection.par_iter().for_each(|(c, _idx, record)| { // increment counter of # of queries. q: could we instead use the _idx from par_iter(), or will it vary based on thread? 
let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); // Load query sig (downsampling happens here) - match query_collection.sig_from_record(record) { + match c.sig_from_record(record) { Ok(query_sig) => { let name = query_sig.name(); let prefix = name.split(' ').next().unwrap_or_default().to_string(); diff --git a/src/manysearch.rs b/src/manysearch.rs index a200b52d..5a585597 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -58,7 +58,7 @@ pub fn manysearch( let send = against_collection .par_iter() - .filter_map(|(_idx, record)| { + .filter_map(|(coll, _idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 && i > 0 { eprintln!("Processed {} search sigs", i); @@ -67,7 +67,7 @@ pub fn manysearch( let mut results = vec![]; // against downsampling happens here - match against_collection.sig_from_record(record) { + match coll.sig_from_record(record) { Ok(against_sig) => { if let Some(against_mh) = against_sig.minhash() { for query in query_sketchlist.iter() { diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index ea99153c..eb665cb6 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -54,12 +54,12 @@ pub fn mastiff_manygather( let send = query_collection .par_iter() - .filter_map(|(_idx, record)| { + .filter_map(|(coll, _idx, record)| { let threshold = threshold_bp / selection.scaled()? 
as usize; let ksize = selection.ksize()?; // query downsampling happens here - match query_collection.sig_from_record(record) { + match coll.sig_from_record(record) { Ok(query_sig) => { let mut results = vec![]; if let Some(query_mh) = query_sig.minhash() { diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index fac364c6..dee55e53 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -56,7 +56,7 @@ pub fn mastiff_manysearch( let send_result = query_collection .par_iter() - .filter_map(|(_idx, record)| { + .filter_map(|(coll, _idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 && i > 0 { eprintln!("Processed {} search sigs", i); @@ -64,7 +64,7 @@ pub fn mastiff_manysearch( let mut results = vec![]; // query downsample happens here - match query_collection.sig_from_record(record) { + match coll.sig_from_record(record) { Ok(query_sig) => { if let Some(query_mh) = query_sig.minhash() { let query_size = query_mh.size(); diff --git a/src/python/tests/test_fastgather.py b/src/python/tests/test_fastgather.py index bd2ca5a4..90d22786 100644 --- a/src/python/tests/test_fastgather.py +++ b/src/python/tests/test_fastgather.py @@ -604,7 +604,8 @@ def test_simple_hp(runtmp): def test_indexed_against(runtmp, capfd): - # do not accept rocksdb for now + return + # do not accept rocksdb for now @CTB we do now!! query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') diff --git a/src/utils.rs b/src/utils/mod.rs similarity index 86% rename from src/utils.rs rename to src/utils/mod.rs index bec20457..3b958ef1 100644 --- a/src/utils.rs +++ b/src/utils/mod.rs @@ -1,10 +1,9 @@ //! Utility functions for `sourmash_plugin_branchwater`. 
- use rayon::prelude::*; use sourmash::encodings::HashFunctions; use sourmash::selection::Select; -use anyhow::{anyhow, Context, Result}; +use anyhow::{anyhow, Result}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; use csv::Writer; @@ -13,7 +12,7 @@ use serde::{Deserialize, Serialize}; use std::cmp::{Ordering, PartialOrd}; use std::collections::BinaryHeap; use std::fs::{create_dir_all, File}; -use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::io::{BufWriter, Write}; use std::panic; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -21,24 +20,18 @@ use zip::write::{ExtendedFileOptions, FileOptions, ZipWriter}; use zip::CompressionMethod; use sourmash::ani_utils::{ani_ci_from_containment, ani_from_containment}; -use sourmash::collection::Collection; use sourmash::manifest::{Manifest, Record}; use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::KmerMinHash; -use sourmash::storage::{FSStorage, InnerStorage, SigStore}; +use sourmash::storage::SigStore; use stats::{median, stddev}; use std::collections::{HashMap, HashSet}; -/// Track a name/minhash. -pub struct SmallSignature { - pub location: String, - pub name: String, - pub md5sum: String, - pub minhash: KmerMinHash, -} -/// Structure to hold overlap information from comparisons. +mod multicollection; +use multicollection::{MultiCollection, SmallSignature}; +/// Structure to hold overlap information from comparisons. 
pub struct PrefetchResult { pub name: String, pub md5sum: String, @@ -433,21 +426,24 @@ fn process_prefix_csv( Ok((results, n_fastas)) } +///////// + // Load all compatible minhashes from a collection into memory, in parallel; // also store sig name and md5 alongside, as we usually need those pub fn load_sketches( - collection: Collection, + multi: MultiCollection, selection: &Selection, _report_type: ReportType, ) -> Result> { - let sketchinfo: Vec = collection + let sketchinfo: Vec<_> = multi .par_iter() - .filter_map(|(_idx, record)| match collection.sig_from_record2(record) { + .filter_map(|(coll, _idx, record)| match coll.sig_from_record(record) { Ok(sig) => { let selected_sig = sig.clone().select(selection).ok()?; let minhash = selected_sig.minhash()?.clone(); Some(SmallSignature { + collection: coll.clone(), // @CTB location: record.internal_location().to_string(), name: sig.name(), md5sum: sig.md5sum(), @@ -471,7 +467,7 @@ pub fn load_sketches( /// those with a minimum overlap. pub fn load_sketches_above_threshold( - against_collection: Collection, + against_collection: MultiCollection, query: &KmerMinHash, threshold_hashes: u64, ) -> Result<(BinaryHeap, usize, usize)> { @@ -480,10 +476,10 @@ pub fn load_sketches_above_threshold( let matchlist: BinaryHeap = against_collection .par_iter() - .filter_map(|(_idx, against_record)| { + .filter_map(|(coll, _idx, against_record)| { let mut results = Vec::new(); // Load against into memory - if let Ok(against_sig) = against_collection.sig_from_record(against_record) { + if let Ok(against_sig) = coll.sig_from_record(against_record) { if let Some(against_mh) = against_sig.minhash() { // downsample against_mh, but keep original md5sum let against_mh_ds = against_mh.downsample_scaled(query.scaled()).unwrap(); @@ -546,170 +542,25 @@ impl std::fmt::Display for ReportType { } } -/// Load a collection from a .zip file. 
- -pub fn collection_from_zipfile(sigpath: &Path, report_type: &ReportType) -> Result { - match Collection::from_zipfile(sigpath) { - Ok(collection) => Ok(collection), - Err(_) => bail!("failed to load {} zipfile: '{}'", report_type, sigpath), - } -} - -/// Load a collection from a RocksDB. - -pub fn collection_from_rocksdb(sigpath: &Path, report_type: &ReportType) -> Result { - match Collection::from_rocksdb(sigpath) { - Ok(collection) => Ok(collection), - Err(_) => bail!("failed to load {} rocksdb: '{}'", report_type, sigpath), - } -} - -/// Load a collection from a manifest CSV. - -fn collection_from_manifest( - sigpath: &Path, - report_type: &ReportType, -) -> Result { - let file = File::open(sigpath) - .with_context(|| format!("Failed to open {} file: '{}'", report_type, sigpath))?; - - let reader = BufReader::new(file); - let manifest = Manifest::from_reader(reader).with_context(|| { - format!( - "Failed to read {} manifest from: '{}'", - report_type, sigpath - ) - })?; - - if manifest.is_empty() { - // If the manifest is empty, return an error constructed with the anyhow! macro - Err(anyhow!("could not read as manifest: '{}'", sigpath)) - } else { - // If the manifest is not empty, proceed to create and return the Collection - eprintln!("collection from manifest!"); - Ok(Collection::new( - manifest, - InnerStorage::new( - FSStorage::builder() - .fullpath("".into()) - .subdir("".into()) - .build(), - ), - )) - } -} - -/// Load a collection from a list of paths. 
- -fn collection_from_pathlist( - sigpath: &Path, - report_type: &ReportType, -) -> Result<(Collection, usize), anyhow::Error> { - let file = File::open(sigpath).with_context(|| { - format!( - "Failed to open {} pathlist file: '{}'", - report_type, sigpath - ) - })?; - let reader = BufReader::new(file); - - // load list of paths - let lines: Vec<_> = reader - .lines() - .filter_map(|line| match line { - Ok(path) => Some(path), - Err(_err) => None, - }) - .collect(); - - // load sketches from paths in parallel. - let n_failed = AtomicUsize::new(0); - let records: Vec = lines - .par_iter() - .filter_map(|path| match Signature::from_path(path) { - Ok(signatures) => { - let recs: Vec = signatures - .into_iter() - .flat_map(|v| Record::from_sig(&v, path)) - .collect(); - Some(recs) - } - Err(err) => { - eprintln!("Sketch loading error: {}", err); - eprintln!("WARNING: could not load sketches from path '{}'", path); - let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); - None - } - }) - .flatten() - .collect(); - - if records.is_empty() { - eprintln!( - "No valid signatures found in {} pathlist '{}'", - report_type, sigpath - ); - } - - let manifest: Manifest = records.into(); - let collection = Collection::new( - manifest, - InnerStorage::new( - FSStorage::builder() - .fullpath("".into()) - .subdir("".into()) - .build(), - ), - ); - let n_failed = n_failed.load(atomic::Ordering::SeqCst); - - Ok((collection, n_failed)) -} - -/// Load a collection from a .sig/.sig.gz JSON file. - -fn collection_from_signature(sigpath: &Path, report_type: &ReportType) -> Result { - let signatures = Signature::from_path(sigpath).with_context(|| { - format!( - "Failed to load {} signatures from: '{}'", - report_type, sigpath - ) - })?; - - Collection::from_sigs(signatures).with_context(|| { - format!( - "Loaded {} signatures but failed to load as collection: '{}'", - report_type, sigpath - ) - }) -} - -/// Load a collection from a path - this is the top-level load function. 
+/// Load a multi collection from a path - this is the new top-level load function. pub fn load_collection( siglist: &String, selection: &Selection, report_type: ReportType, allow_failed: bool, -) -> Result { +) -> Result { let sigpath = PathBuf::from(siglist); if !sigpath.exists() { bail!("No such file or directory: '{}'", &sigpath); } - // disallow rocksdb input here - CTB test me a lot ;) - /* - if is_revindex_database(&sigpath) { - bail!("Cannot load {} signatures from a 'rocksdb' database. Please use sig, zip, or pathlist.", report_type); - } - */ - eprintln!("Reading {}(s) from: '{}'", report_type, &siglist); let mut last_error = None; let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { - match collection_from_zipfile(&sigpath, &report_type) { + match MultiCollection::from_zipfile(&sigpath) { Ok(coll) => Some((coll, 0)), Err(e) => { last_error = Some(e); @@ -720,7 +571,7 @@ pub fn load_collection( None }; - let collection = collection.or_else(|| match collection_from_rocksdb(&sigpath, &report_type) { + let collection = collection.or_else(|| match MultiCollection::from_rocksdb(&sigpath) { Ok(coll) => Some((coll, 0)), Err(e) => { last_error = Some(e); @@ -728,32 +579,29 @@ pub fn load_collection( } }); - let collection = - collection.or_else(|| match collection_from_manifest(&sigpath, &report_type) { - Ok(coll) => Some((coll, 0)), - Err(e) => { - last_error = Some(e); - None - } - }); + let collection = collection.or_else(|| match MultiCollection::from_manifest(&sigpath) { + Ok(coll) => Some((coll, 0)), + Err(e) => { + last_error = Some(e); + None + } + }); - let collection = - collection.or_else(|| match collection_from_signature(&sigpath, &report_type) { - Ok(coll) => Some((coll, 0)), - Err(e) => { - last_error = Some(e); - None - } - }); + let collection = collection.or_else(|| match MultiCollection::from_signature(&sigpath) { + Ok(coll) => Some((coll, 0)), + Err(e) => { + last_error = Some(e); + None + } + }); - let collection = - 
collection.or_else(|| match collection_from_pathlist(&sigpath, &report_type) { - Ok((coll, n_failed)) => Some((coll, n_failed)), - Err(e) => { - last_error = Some(e); - None - } - }); + let collection = collection.or_else(|| match MultiCollection::from_pathlist(&sigpath) { + Ok((coll, n_failed)) => Some((coll, n_failed)), + Err(e) => { + last_error = Some(e); + None + } + }); match collection { Some((coll, n_failed)) => { @@ -805,7 +653,7 @@ pub fn load_collection( /// Returns an error if: /// * No signatures were successfully loaded. pub fn report_on_collection_loading( - collection: &Collection, + collection: &MultiCollection, skipped_paths: usize, failed_paths: usize, report_type: ReportType, diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs new file mode 100644 index 00000000..7804c021 --- /dev/null +++ b/src/utils/multicollection.rs @@ -0,0 +1,229 @@ +//! MultiCollection implementation to handle sketches coming from multiple files. + +use rayon::prelude::*; + +use anyhow::{anyhow, Context, Result}; +use camino::Utf8Path as Path; +use log::debug; +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::sync::atomic; +use std::sync::atomic::AtomicUsize; + +use sourmash::collection::{Collection, CollectionSet}; +use sourmash::encodings::Idx; +use sourmash::errors::SourmashError; +use sourmash::manifest::{Manifest, Record}; +use sourmash::selection::{Select, Selection}; +use sourmash::signature::Signature; +use sourmash::sketch::minhash::KmerMinHash; +use sourmash::storage::{FSStorage, InnerStorage, SigStore}; + +/// A collection of sketches, potentially stored in multiple files. 
+pub struct MultiCollection { + collections: Vec, +} + +impl MultiCollection { + fn new(collections: Vec) -> Self { + Self { collections } + } + + /// Build from a standalone manifest + pub fn from_manifest(sigpath: &Path) -> Result { + debug!("multi from manifest!"); + let file = + File::open(sigpath).with_context(|| format!("Failed to open file: '{}'", sigpath))?; + + let reader = BufReader::new(file); + let manifest = Manifest::from_reader(reader) + .with_context(|| format!("Failed to read manifest from: '{}'", sigpath))?; + + if manifest.is_empty() { + Err(anyhow!("could not read as manifest: '{}'", sigpath)) + } else { + let coll = Collection::new( + manifest, + InnerStorage::new( + FSStorage::builder() + .fullpath("".into()) + .subdir("".into()) + .build(), + ), + ); + Ok(MultiCollection::new(vec![coll])) + } + } + + /// Load a collection from a .zip file. + pub fn from_zipfile(sigpath: &Path) -> Result { + debug!("multi from zipfile!"); + match Collection::from_zipfile(sigpath) { + Ok(collection) => Ok(MultiCollection::new(vec![collection])), + Err(_) => bail!("failed to load zipfile: '{}'", sigpath), + } + } + + /// Load a collection from a RocksDB. + pub fn from_rocksdb(sigpath: &Path) -> Result { + debug!("multi from rocksdb!"); + match Collection::from_rocksdb(sigpath) { + Ok(collection) => Ok(MultiCollection::new(vec![collection])), + Err(_) => bail!("failed to load rocksdb: '{}'", sigpath), + } + } + + /// Load a collection from a list of paths. + pub fn from_pathlist(sigpath: &Path) -> Result<(Self, usize)> { + debug!("multi from pathlist!"); + let file = File::open(sigpath) + .with_context(|| format!("Failed to open pathlist file: '{}'", sigpath))?; + let reader = BufReader::new(file); + + // load list of paths + let lines: Vec<_> = reader + .lines() + .filter_map(|line| match line { + Ok(path) => Some(path), + Err(_err) => None, + }) + .collect(); + + // load sketches from paths in parallel. 
+ let n_failed = AtomicUsize::new(0); + let records: Vec = lines + .par_iter() + .filter_map(|path| match Signature::from_path(path) { + Ok(signatures) => { + let recs: Vec = signatures + .into_iter() + .flat_map(|v| Record::from_sig(&v, path)) + .collect(); + Some(recs) + } + Err(err) => { + eprintln!("Sketch loading error: {}", err); + eprintln!("WARNING: could not load sketches from path '{}'", path); + let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); + None + } + }) + .flatten() + .collect(); + + if records.is_empty() { + eprintln!("No valid signatures found in pathlist '{}'", sigpath); + } + + let manifest: Manifest = records.into(); + let collection = Collection::new( + manifest, + InnerStorage::new( + FSStorage::builder() + .fullpath("".into()) + .subdir("".into()) + .build(), + ), + ); + let n_failed = n_failed.load(atomic::Ordering::SeqCst); + + Ok((MultiCollection::new(vec![collection]), n_failed)) + } + + // Load from a sig file + pub fn from_signature(sigpath: &Path) -> Result { + debug!("multi from signature!"); + let signatures = Signature::from_path(sigpath) + .with_context(|| format!("Failed to load signatures from: '{}'", sigpath))?; + + let coll = Collection::from_sigs(signatures).with_context(|| { + format!( + "Loaded signatures but failed to load as collection: '{}'", + sigpath + ) + })?; + Ok(MultiCollection::new(vec![coll])) + } + + pub fn len(&self) -> usize { + let val: usize = self.collections.iter().map(|c| c.len()).sum(); + val + } + pub fn is_empty(&self) -> bool { + let val: usize = self.collections.iter().map(|c| c.len()).sum(); + if val > 0 { + false + } else { + true + } + } + + pub fn iter(&self) -> impl Iterator { + self.collections.iter() + } + + // iterate over tuples + pub fn item_iter(&self) -> impl Iterator { + // CTB: request review by Rust expert pls :). Does this make + // unnecessary copies?? 
+ let s: Vec<_> = self + .iter() + .map(|c| c.iter().map(move |(_idx, record)| (c, _idx, record))) + .flatten() + .collect(); + s.into_iter() + } + + pub fn par_iter(&self) -> impl IndexedParallelIterator { + // CTB: request review by Rust expert - why can't I use item_iter here? + // i.e. self.item_iter().into_par_iter()? + let s: Vec<_> = self + .iter() + .map(|c| c.iter().map(move |(_idx, record)| (c, _idx, record))) + .flatten() + .collect(); + s.into_par_iter() + } + + pub fn get_first_sig(&self) -> Option { + if !self.is_empty() { + let query_item = self.item_iter().next().unwrap(); + let (coll, _, _) = query_item; + Some(coll.sig_for_dataset(0).ok()?) + } else { + None + } + } +} + +impl Select for MultiCollection { + fn select(mut self, selection: &Selection) -> Result { + // CTB: request review by Rust expert! Is the clone necessary? + self.collections = self + .iter() + .filter_map(|c| c.clone().select(selection).ok()) + .collect(); + Ok(self) + } +} + +impl TryFrom for CollectionSet { + type Error = SourmashError; + + fn try_from(multi: MultiCollection) -> Result { + // CTB: request review by Rust expert! Is the clone necessary? + let coll = multi.iter().next().unwrap().clone(); + let cs: CollectionSet = coll.try_into()?; + Ok(cs) + } +} + +/// Track a name/minhash. +pub struct SmallSignature { + // CTB: request help - can we/should we use references & lifetimes here? + pub collection: Collection, + pub location: String, + pub name: String, + pub md5sum: String, + pub minhash: KmerMinHash, +} From a1b19aec753d859661cff2441d7bec239eca9198 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Tue, 20 Aug 2024 12:24:25 -0700 Subject: [PATCH 28/31] clippy fixes --- src/fastmultigather.rs | 2 +- src/lib.rs | 1 + src/multisearch.rs | 4 ++-- src/utils/mod.rs | 8 ++++---- src/utils/multicollection.rs | 12 +++--------- 5 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 0aa26adf..07dc22d2 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -133,7 +133,7 @@ pub fn fastmultigather( if let Ok(mut file) = File::create(&sig_filename) { let unique_hashes: HashSet = hashes.into_iter().collect(); let mut new_mh = KmerMinHash::new( - query_mh.scaled().try_into().unwrap(), + query_mh.scaled(), query_mh.ksize().try_into().unwrap(), query_mh.hash_function().clone(), query_mh.seed(), diff --git a/src/lib.rs b/src/lib.rs index 2a7fa58d..194bfae8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -110,6 +110,7 @@ fn do_fastgather( } #[pyfunction] +#[allow(clippy::too_many_arguments)] #[pyo3(signature = (query_filenames, siglist_path, threshold_bp, ksize, scaled, moltype, output_path=None, save_matches=false))] fn do_fastmultigather( query_filenames: String, diff --git a/src/multisearch.rs b/src/multisearch.rs index 6edad2c1..17f8dfaf 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -60,12 +60,12 @@ pub fn multisearch( let processed_cmp = AtomicUsize::new(0); let ksize = selection.ksize().unwrap() as f64; - if queries.len() == 0 { + if queries.is_empty() { eprintln!("No query sketches present. Exiting."); return Err(anyhow::anyhow!("failed to load query sketches").into()); } - if against.len() == 0 { + if against.is_empty() { eprintln!("No search sketches present. 
Exiting."); return Err(anyhow::anyhow!("failed to load search sketches").into()); } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 3b958ef1..a6ff9f23 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -853,7 +853,7 @@ pub fn consume_query_by_gather( } let query_md5sum: String = orig_query_mh.md5sum().clone(); let query_name = query.name().clone(); - let query_scaled = orig_query_mh.scaled().clone() as usize; //query_mh.scaled() as usize + let query_scaled = orig_query_mh.scaled() as usize; let mut query_mh = orig_query_mh.clone(); let mut orig_query_ds = orig_query_mh.clone().downsample_scaled(scaled)?; @@ -924,11 +924,11 @@ pub fn consume_query_by_gather( query_filename: query.filename(), query_name: query_name.clone(), query_md5: query_md5sum.clone(), - query_bp: query_bp.clone(), + query_bp, ksize, moltype: query_moltype.clone(), - scaled: query_scaled.clone(), - query_n_hashes: query_n_hashes, + scaled: query_scaled, + query_n_hashes, query_abundance: query_mh.track_abundance(), query_containment_ani: match_.query_containment_ani, match_containment_ani: match_.match_containment_ani, diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index 7804c021..d0b66425 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -151,11 +151,7 @@ impl MultiCollection { } pub fn is_empty(&self) -> bool { let val: usize = self.collections.iter().map(|c| c.len()).sum(); - if val > 0 { - false - } else { - true - } + val == 0 } pub fn iter(&self) -> impl Iterator { @@ -168,8 +164,7 @@ impl MultiCollection { // unnecessary copies?? let s: Vec<_> = self .iter() - .map(|c| c.iter().map(move |(_idx, record)| (c, _idx, record))) - .flatten() + .flat_map(|c| c.iter().map(move |(_idx, record)| (c, _idx, record))) .collect(); s.into_iter() } @@ -179,8 +174,7 @@ impl MultiCollection { // i.e. self.item_iter().into_par_iter()? 
let s: Vec<_> = self .iter() - .map(|c| c.iter().map(move |(_idx, record)| (c, _idx, record))) - .flatten() + .flat_map(|c| c.iter().map(move |(_idx, record)| (c, _idx, record))) .collect(); s.into_par_iter() } From 51a14ac42ca70ce3eb5e461296ce815683f16e92 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 20 Aug 2024 13:51:01 -0700 Subject: [PATCH 29/31] compiling again --- src/utils/mod.rs | 2 +- src/utils/multicollection.rs | 109 +++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 51 deletions(-) diff --git a/src/utils/mod.rs b/src/utils/mod.rs index a6ff9f23..8e60ac63 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -579,7 +579,7 @@ pub fn load_collection( } }); - let collection = collection.or_else(|| match MultiCollection::from_manifest(&sigpath) { + let collection = collection.or_else(|| match MultiCollection::from_standalone_manifest(&sigpath) { Ok(coll) => Some((coll, 0)), Err(e) => { last_error = Some(e); diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index d0b66425..42a1088f 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -9,6 +9,7 @@ use std::fs::File; use std::io::{BufRead, BufReader}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; +use std::collections::HashSet; use sourmash::collection::{Collection, CollectionSet}; use sourmash::encodings::Idx; @@ -29,9 +30,49 @@ impl MultiCollection { Self { collections } } + // Turn a set of paths into list of Collections. + fn load_set_of_paths(paths: HashSet) -> (Vec, usize) { + let n_failed = AtomicUsize::new(0); + + let colls: Vec<_> = paths + .par_iter() + .filter_map(|iloc| match iloc { + // could just use a variant of load_collection here? 
+ x if x.ends_with(".zip") => { + debug!("loading sigs from zipfile {}", x); + Some(Collection::from_zipfile(x).unwrap()) + }, + _ => { + debug!("loading sigs from sigfile {}", iloc); + let signatures = match Signature::from_path(iloc) { + Ok(signatures) => Some(signatures), + Err(err) => { + eprintln!("Sketch loading error: {}", err); + None + } + }; + + match signatures { + Some(signatures) => { + Some(Collection::from_sigs(signatures).unwrap()) + }, + None => { + eprintln!("WARNING: could not load sketches from path '{}'", iloc); + let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); + None + } + } + } + }) + .collect(); + + let n_failed = n_failed.load(atomic::Ordering::SeqCst); + (colls, n_failed) + } + /// Build from a standalone manifest - pub fn from_manifest(sigpath: &Path) -> Result { - debug!("multi from manifest!"); + pub fn from_standalone_manifest(sigpath: &Path) -> Result { + debug!("multi from standalone manifest!"); let file = File::open(sigpath).with_context(|| format!("Failed to open file: '{}'", sigpath))?; @@ -42,16 +83,15 @@ impl MultiCollection { if manifest.is_empty() { Err(anyhow!("could not read as manifest: '{}'", sigpath)) } else { - let coll = Collection::new( - manifest, - InnerStorage::new( - FSStorage::builder() - .fullpath("".into()) - .subdir("".into()) - .build(), - ), - ); - Ok(MultiCollection::new(vec![coll])) + let ilocs: HashSet<_> = manifest + .internal_locations() + .map(|s| String::from(s)) + .collect(); + + let (colls, _n_failed) = MultiCollection::load_set_of_paths(ilocs); + let colls = colls.into_iter().collect(); + + Ok(MultiCollection::new(colls)) } } @@ -80,8 +120,8 @@ impl MultiCollection { .with_context(|| format!("Failed to open pathlist file: '{}'", sigpath))?; let reader = BufReader::new(file); - // load list of paths - let lines: Vec<_> = reader + // load set of paths + let lines: HashSet<_> = reader .lines() .filter_map(|line| match line { Ok(path) => Some(path), @@ -89,45 +129,14 @@ impl MultiCollection { 
}) .collect(); - // load sketches from paths in parallel. - let n_failed = AtomicUsize::new(0); - let records: Vec = lines - .par_iter() - .filter_map(|path| match Signature::from_path(path) { - Ok(signatures) => { - let recs: Vec = signatures - .into_iter() - .flat_map(|v| Record::from_sig(&v, path)) - .collect(); - Some(recs) - } - Err(err) => { - eprintln!("Sketch loading error: {}", err); - eprintln!("WARNING: could not load sketches from path '{}'", path); - let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); - None - } - }) - .flatten() - .collect(); + let num_to_load = lines.len(); - if records.is_empty() { - eprintln!("No valid signatures found in pathlist '{}'", sigpath); - } + let (colls, n_failed) = MultiCollection::load_set_of_paths(lines); + let colls: Vec<_> = colls.into_iter().collect(); - let manifest: Manifest = records.into(); - let collection = Collection::new( - manifest, - InnerStorage::new( - FSStorage::builder() - .fullpath("".into()) - .subdir("".into()) - .build(), - ), - ); - let n_failed = n_failed.load(atomic::Ordering::SeqCst); + let n_missing = num_to_load - colls.len(); - Ok((MultiCollection::new(vec![collection]), n_failed)) + Ok((MultiCollection::new(colls), n_missing)) } // Load from a sig file From 99bd174cd4fcd3a936d7778656f0c79416d91a8a Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Tue, 20 Aug 2024 13:58:23 -0700 Subject: [PATCH 30/31] cleanup --- src/utils/multicollection.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index 42a1088f..02118a1d 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -18,7 +18,7 @@ use sourmash::manifest::{Manifest, Record}; use sourmash::selection::{Select, Selection}; use sourmash::signature::Signature; use sourmash::sketch::minhash::KmerMinHash; -use sourmash::storage::{FSStorage, InnerStorage, SigStore}; +use sourmash::storage::SigStore; /// A collection of sketches, potentially stored in multiple files. pub struct MultiCollection { @@ -129,14 +129,10 @@ impl MultiCollection { }) .collect(); - let num_to_load = lines.len(); - let (colls, n_failed) = MultiCollection::load_set_of_paths(lines); let colls: Vec<_> = colls.into_iter().collect(); - let n_missing = num_to_load - colls.len(); - - Ok((MultiCollection::new(colls), n_missing)) + Ok((MultiCollection::new(colls), n_failed)) } // Load from a sig file From 7d2e0c81d82b5db69c51d5ba6674fd451f67c860 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Tue, 20 Aug 2024 14:12:13 -0700 Subject: [PATCH 31/31] foo --- src/branch_api.rs | 3 ++- src/utils/mod.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/branch_api.rs b/src/branch_api.rs index 637bff0e..d89838bc 100644 --- a/src/branch_api.rs +++ b/src/branch_api.rs @@ -4,6 +4,7 @@ use pyo3::prelude::*; use crate::utils::build_selection; use crate::utils::load_collection; use crate::utils::ReportType; +use crate::utils::multicollection::MultiCollection; use sourmash::collection::Collection; use sourmash::manifest::{Manifest, Record}; use pyo3::types::{IntoPyDict, PyDict, PyList}; @@ -87,7 +88,7 @@ pub struct BranchCollection { #[pyo3(get)] pub has_manifest: bool, - collection: Collection, + collection: MultiCollection, } #[pymethods] diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 8e60ac63..1e06d31e 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -28,7 +28,7 @@ use sourmash::storage::SigStore; use stats::{median, stddev}; use std::collections::{HashMap, HashSet}; -mod multicollection; +pub mod multicollection; use multicollection::{MultiCollection, SmallSignature}; /// Structure to hold overlap information from comparisons.