diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 290ed255..6acfe902 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -3,27 +3,28 @@ import argparse from sourmash.plugins import CommandLinePlugin from sourmash.logging import notify -from sourmash.exceptions import IndexNotLoaded import os import importlib.metadata from . import sourmash_plugin_branchwater -from . import sourmash_plugin_branchwater as api from . import prettyprint __version__ = importlib.metadata.version("sourmash_plugin_branchwater") + def print_version(): - notify(f"=> sourmash_plugin_branchwater {__version__}; cite Irber et al., doi: 10.1101/2022.11.02.514947\n") + notify( + f"=> sourmash_plugin_branchwater {__version__}; cite Irber et al., doi: 10.1101/2022.11.02.514947\n" + ) def get_max_cores(): try: - if 'SLURM_CPUS_ON_NODE' in os.environ: - return int(os.environ['SLURM_CPUS_ON_NODE']) - elif 'SLURM_JOB_CPUS_PER_NODE' in os.environ: - cpus_per_node_str = os.environ['SLURM_JOB_CPUS_PER_NODE'] - return int(cpus_per_node_str.split('x')[0]) + if "SLURM_CPUS_ON_NODE" in os.environ: + return int(os.environ["SLURM_CPUS_ON_NODE"]) + elif "SLURM_JOB_CPUS_PER_NODE" in os.environ: + cpus_per_node_str = os.environ["SLURM_JOB_CPUS_PER_NODE"] + return int(cpus_per_node_str.split("x")[0]) else: return os.cpu_count() except Exception: @@ -34,100 +35,108 @@ def set_thread_pool(user_cores): avail_threads = get_max_cores() num_threads = min(avail_threads, user_cores) if user_cores else avail_threads if user_cores and user_cores > avail_threads: - notify(f"warning: only {avail_threads} threads available, using {avail_threads}") + notify( + f"warning: only {avail_threads} threads available, using {avail_threads}" + ) actual_rayon_cores = sourmash_plugin_branchwater.set_global_thread_pool(num_threads) return actual_rayon_cores -class BranchwaterManifestWrapper: - def __init__(self, mf_obj): - self.obj = mf_obj - - def _check_row_values(self): - return self.obj._check_row_values() - - @property - def rows(self): - return self.obj.rows - - -class BranchwaterCollectionWrapper: - def __init__(self, coll_obj): - self.obj = coll_obj - - @property - def location(self): - return self.obj.location - - @property - def is_database(self): - return self.obj.is_database - - @property - def has_manifest(self): - return self.obj.has_manifest - - @property - def manifest(self): - return BranchwaterManifestWrapper(self.obj.manifest) - - def __len__(self): - return len(self.obj) - - -def load_collection(path, *, traverse_yield_all=False, cache_size=0): - try: - coll_obj = api.api_load_collection(path, 31, 100_000, 'DNA') - return BranchwaterCollectionWrapper(coll_obj) - except: - raise IndexNotLoaded(f"branchwater could not load '{path}'") -load_collection.priority = 20 - - class Branchwater_Manysearch(CommandLinePlugin): - command = 'manysearch' - description = 'search many metagenomes for contained genomes' + command = "manysearch" + description = "search many metagenomes for contained genomes" def __init__(self, p): super().__init__(p) - p.add_argument('query_paths', - help="input file of sketches") - p.add_argument('against_paths', - help="input file of sketches") - p.add_argument('-o', '--output', required=True, - help='CSV output file for matches') - p.add_argument('-t', '--threshold', default=0.01, type=float, - help='containment threshold for reporting matches (default: 0.01)') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to select sketches') - p.add_argument('-s', '--scaled', default=1000, type=int, - help='scaled factor at which to do comparisons') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('-P', '--pretty-print', action='store_true', - default=True, - help="display results after search finishes (default: True)") - p.add_argument('-N', '--no-pretty-print', action='store_false', - dest='pretty_print', - help="do not display results (e.g. for large output)") + p.add_argument("query_paths", help="input file of sketches") + p.add_argument("against_paths", help="input file of sketches") + p.add_argument( + "-o", "--output", required=True, help="CSV output file for matches" + ) + p.add_argument( + "-t", + "--threshold", + default=0.01, + type=float, + help="containment threshold for reporting matches (default: 0.01)", + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to select sketches", + ) + p.add_argument( + "-s", + "--scaled", + default=None, + type=int, + help="scaled factor at which to do comparisons", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "-P", + "--pretty-print", + action="store_true", + default=True, + help="display results after search finishes (default: True)", + ) + p.add_argument( + "-N", + "--no-pretty-print", + action="store_false", + dest="pretty_print", + help="do not display results (e.g. for large output)", + ) + p.add_argument( + "--ignore-abundance", + action="store_true", + help="do not do expensive abundance calculations", + ) + p.add_argument( + "-A", + "--output-all-comparisons", + action="store_true", + help="ignore threshold and output all comparisons; against a RocksDB database, this will only output comparisons with some overlap", + ) def main(self, args): print_version() - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}" + ) num_threads = set_thread_pool(args.cores) - notify(f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads") + notify( + f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_manysearch(args.query_paths, - args.against_paths, - args.threshold, - args.ksize, - args.scaled, - args.moltype, - args.output) + status = sourmash_plugin_branchwater.do_manysearch( + args.query_paths, + args.against_paths, + args.threshold, + args.ksize, + args.scaled, + args.moltype, + args.output, + args.ignore_abundance, + args.output_all_comparisons, + ) if status == 0: notify(f"...manysearch is done! results in '{args.output}'") @@ -137,46 +146,80 @@ def main(self, args): class Branchwater_Fastgather(CommandLinePlugin): - command = 'fastgather' - description = 'massively parallel sketch gather' + command = "fastgather" + description = "massively parallel sketch gather" def __init__(self, p): super().__init__(p) - p.add_argument('query_sig', help="metagenome sketch") - p.add_argument('against_paths', help="input file of sketches") - p.add_argument('-o', '--output-gather', required=True, - help="save gather output (minimum metagenome cover) to this file") - p.add_argument('--output-prefetch', - help="save prefetch output (all overlaps) to this file") - p.add_argument('-t', '--threshold-bp', default=50000, type=float, - help='threshold in estimated base pairs, for reporting matches (default: 50kb)') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to do comparisons (default: 31)') - p.add_argument('-s', '--scaled', default=1000, type=int, - help='scaled factor at which to do comparisons (default: 1000)') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - + p.add_argument("query_sig", help="metagenome sketch") + p.add_argument("against_paths", help="input file of sketches") + p.add_argument( + "-o", + "--output-gather", + required=True, + help="save gather output (minimum metagenome cover) to this file", + ) + p.add_argument( + "--output-prefetch", help="save prefetch output (all overlaps) to this file" + ) + p.add_argument( + "-t", + "--threshold-bp", + default=50000, + type=float, + help="threshold in estimated base pairs, for reporting matches (default: 50kb)", + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to do comparisons (default: 31)", + ) + p.add_argument( + "-s", + "--scaled", + default=None, + type=int, + help="scaled factor at which to do comparisons (default: determined from query)", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) def main(self, args): print_version() - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold bp: {args.threshold_bp}") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold bp: {args.threshold_bp}" + ) num_threads = set_thread_pool(args.cores) - - notify(f"gathering all sketches in '{args.query_sig}' against '{args.against_paths}' using {num_threads} threads") + notify( + f"gathering all sketches in '{args.query_sig}' against '{args.against_paths}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_fastgather(args.query_sig, - args.against_paths, - int(args.threshold_bp), - args.ksize, - args.scaled, - args.moltype, - args.output_gather, - args.output_prefetch) + status = sourmash_plugin_branchwater.do_fastgather( + args.query_sig, + args.against_paths, + int(args.threshold_bp), + args.ksize, + args.scaled, + args.moltype, + args.output_gather, + args.output_prefetch, + ) if status == 0: notify(f"...fastgather is done! gather results in '{args.output_gather}'") if args.output_prefetch: @@ -185,103 +228,182 @@ def main(self, args): class Branchwater_Fastmultigather(CommandLinePlugin): - command = 'fastmultigather' - description = 'massively parallel sketch multigather' + command = "fastmultigather" + description = "massively parallel sketch multigather" def __init__(self, p): super().__init__(p) - p.add_argument('query_paths', help="input file of sketches to query") - p.add_argument('against_paths', help="input file of sketches to search against \ - OR a branchwater indexed database generated with 'sourmash scripts index'") - p.add_argument('-t', '--threshold-bp', default=50000, type=float, - help='threshold in estimated base pairs, for reporting matches (default: 50kb)') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to do comparisons (default: 31)') - p.add_argument('-s', '--scaled', default=1000, type=int, - help='scaled factor at which to do comparisons (default: 1000)') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('-o', '--output', help='CSV output file for matches') - p.add_argument('--save-matches', action='store_true', default=False, help='save matched hashes for every input to a signature') - + p.add_argument("query_paths", help="input file of sketches to query") + p.add_argument( + "against_paths", + help="input file of sketches to search against \ + OR a branchwater indexed database generated with 'sourmash scripts index'", + ) + p.add_argument( + "-t", + "--threshold-bp", + default=50000, + type=float, + help="threshold in estimated base pairs, for reporting matches (default: 50kb)", + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to do comparisons (default: 31)", + ) + p.add_argument( + "-s", + "--scaled", + default=None, + type=int, + help="scaled factor at which to do comparisons (default: determined from query collection)", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "-o", + "--output", + help="CSV output file for matches. Used for non-rocksdb searches only.", + ) + p.add_argument( + "--create-empty-results", + action="store_true", + default=False, + help="create empty results file(s) even if no matches", + ) + p.add_argument( + "--save-matches", + action="store_true", + default=False, + help="save matched hashes for every input to a signature", + ) def main(self, args): print_version() - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold bp: {args.threshold_bp} / save matches: {args.save_matches}") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold bp: {args.threshold_bp} / save matches: {args.save_matches}" + ) num_threads = set_thread_pool(args.cores) - notify(f"gathering all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads") + notify( + f"gathering all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_fastmultigather(args.query_paths, - args.against_paths, - int(args.threshold_bp), - args.ksize, - args.scaled, - args.moltype, - args.output, - args.save_matches - ) + status = sourmash_plugin_branchwater.do_fastmultigather( + args.query_paths, + args.against_paths, + int(args.threshold_bp), + args.ksize, + args.scaled, + args.moltype, + args.output, + args.save_matches, + args.create_empty_results, + ) if status == 0: notify(f"...fastmultigather is done!") return status class Branchwater_Index(CommandLinePlugin): - command = 'index' - description = 'Build Branchwater RevIndex' + command = "index" + description = "Build Branchwater RevIndex" def __init__(self, p): super().__init__(p) - p.add_argument('siglist', - help="input file of sketches") - p.add_argument('-o', '--output', required=True, - help='output file for the index') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to select sketches') - p.add_argument('-s', '--scaled', default=1000, type=int, - help='scaled factor at which to do comparisons') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('--internal-storage', default=True, action='store_true', - help="build indexes that contain sketches and are relocatable (default: True)") - p.add_argument('--no-internal-storage', '--no-store-sketches', - action='store_false', - help="do not store sketches in the index; index may not be relocatable (default: False)", - dest='internal_storage') + p.add_argument("siglist", help="input file of sketches") + p.add_argument( + "-o", "--output", required=True, help="output file for the index" + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to select sketches", + ) + p.add_argument( + "-s", + "--scaled", + default=None, + type=int, + help="scaled factor at which to select sketches (default: chosen as max from collection)", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "--internal-storage", + default=True, + action="store_true", + help="build indexes that contain sketches and are relocatable (default: True)", + ) + p.add_argument( + "--no-internal-storage", + "--no-store-sketches", + action="store_false", + help="do not store sketches in the index; index may not be relocatable (default: False)", + dest="internal_storage", + ) def main(self, args): - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} ") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} " + ) num_threads = set_thread_pool(args.cores) notify(f"indexing all sketches in '{args.siglist}'") super().main(args) - status = sourmash_plugin_branchwater.do_index(args.siglist, - args.ksize, - args.scaled, - args.moltype, - args.output, - False, # colors - currently must be false? - args.internal_storage) + status = sourmash_plugin_branchwater.do_index( + args.siglist, + args.ksize, + args.scaled, + args.moltype, + args.output, + False, # colors - currently must be false? + args.internal_storage, + ) if status == 0: notify(f"...index is done! results in '{args.output}'") return status + class Branchwater_Check(CommandLinePlugin): - command = 'check' - description = 'Check Branchwater RevIndex' + command = "check" + description = "Check Branchwater RevIndex" def __init__(self, p): super().__init__(p) - p.add_argument('index', - help="RocksDB index file created with 'index'") - p.add_argument('--quick', action='store_true') + p.add_argument("index", help="RocksDB index file created with 'index'") + p.add_argument("--quick", action="store_true") def main(self, args): notify(f"checking index '{args.index}'") @@ -293,121 +415,319 @@ def main(self, args): class Branchwater_Multisearch(CommandLinePlugin): - command = 'multisearch' - description = 'massively parallel in-memory sketch search' + command = "multisearch" + description = "massively parallel in-memory sketch search" def __init__(self, p): super().__init__(p) - p.add_argument('query_paths', - help="input file of sketches") - p.add_argument('against_paths', - help="input file of sketches") - p.add_argument('-o', '--output', required=True, - help='CSV output file for matches') - p.add_argument('-t', '--threshold', default=0.01, type=float, - help='containment threshold for reporting matches (default: 0.01)') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to select sketches') - p.add_argument('-s', '--scaled', default=1000, type=int, - help='scaled factor at which to do comparisons') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('-a', '--ani', action='store_true', - help='estimate ANI from containment') + p.add_argument("query_paths", help="input file of sketches") + p.add_argument("against_paths", help="input file of sketches") + p.add_argument( + "-o", "--output", required=True, help="CSV output file for matches" + ) + p.add_argument( + "-t", + "--threshold", + default=0.01, + type=float, + help="containment threshold for reporting matches (default: 0.01)", + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to select sketches", + ) + p.add_argument( + "-s", + "--scaled", + default=None, + type=int, + help="scaled factor at which to do comparisons (default: determined from query collection)", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "-a", "--ani", action="store_true", help="estimate ANI from containment" + ) + p.add_argument( + "-p", + "--prob-significant-overlap", + action="store_true", + help="estimate probability of overlap for significance ranking of search results, of the specific query and match, given all queries and possible matches", + ) + p.add_argument( + "-A", + "--output-all-comparisons", + action="store_true", + help="ignore threshold and output all comparisons", + ) def main(self, args): print_version() - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}" + ) num_threads = set_thread_pool(args.cores) - notify(f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads") + notify( + f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads" + ) + notify( + f"estimate ani? {args.ani} / estimate probability of overlap? {args.prob_significant_overlap}" + ) super().main(args) - status = sourmash_plugin_branchwater.do_multisearch(args.query_paths, - args.against_paths, - args.threshold, - args.ksize, - args.scaled, - args.moltype, - args.ani, - args.output) + status = sourmash_plugin_branchwater.do_multisearch( + args.query_paths, + args.against_paths, + args.threshold, + args.ksize, + args.scaled, + args.moltype, + args.ani, + args.prob_significant_overlap, + args.output_all_comparisons, + args.output, + ) if status == 0: notify(f"...multisearch is done! results in '{args.output}'") return status - + + class Branchwater_Pairwise(CommandLinePlugin): - command = 'pairwise' - description = 'massively parallel in-memory pairwise comparisons' + command = "pairwise" + description = "massively parallel in-memory pairwise comparisons" def __init__(self, p): super().__init__(p) - p.add_argument('sig_paths', - help="input file of sketches") - p.add_argument('-o', '--output', required=True, - help='CSV output file for matches') - p.add_argument('-t', '--threshold', default=0.01, type=float, - help='containment threshold for reporting matches') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to select sketches') - p.add_argument('-s', '--scaled', default=1000, type=int, - help='scaled factor at which to do comparisons') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('-a', '--ani', action='store_true', - help='estimate ANI from containment') - p.add_argument('--write-all', action="store_true", - help="write self comparisons for all sketches") + p.add_argument("sig_paths", help="input file of sketches") + p.add_argument( + "-o", "--output", required=True, help="CSV output file for matches" + ) + p.add_argument( + "-t", + "--threshold", + default=0.01, + type=float, + help="containment threshold for reporting matches", + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to select sketches", + ) + p.add_argument( + "-s", + "--scaled", + default=None, + type=int, + help="scaled factor at which to do comparisons", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "-a", "--ani", action="store_true", help="estimate ANI from containment" + ) + p.add_argument( + "--write-all", + "--write-self-comparisons", + action="store_true", + help="write self comparisons for all sketches", + ) + p.add_argument( + "-A", + "--output-all-comparisons", + action="store_true", + help="ignore threshold and output all comparisons", + ) def main(self, args): print_version() - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}" + ) num_threads = set_thread_pool(args.cores) - notify(f"pairwise-comparing all sketches in '{args.sig_paths}' using {num_threads} threads") + notify( + f"pairwise-comparing all sketches in '{args.sig_paths}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_pairwise(args.sig_paths, - args.threshold, - args.ksize, - args.scaled, - args.moltype, - args.ani, - args.write_all, - args.output) + status = sourmash_plugin_branchwater.do_pairwise( + args.sig_paths, + args.threshold, + args.ksize, + args.scaled, + args.moltype, + args.ani, + args.write_all, + args.output_all_comparisons, + args.output, + ) if status == 0: notify(f"...pairwise is done! results in '{args.output}'") return status +class Branchwater_SingleSketch(CommandLinePlugin): + command = "singlesketch" + description = "sketch a single sequence file" + + def __init__(self, p): + super().__init__(p) + p.add_argument("input_filename", help="input FASTA file or '-' for stdin") + p.add_argument( + "-o", + "--output", + required=True, + help="output file for the signature or - for stdout", + ) + p.add_argument( + "-p", + "--param-string", + action="append", + type=str, + default=[], + help="parameter string for sketching (default: k=31,scaled=1000)", + ) + p.add_argument( + "-n", + "--name", + help="optional name for the signature, default is the basename of input path", + ) + p.add_argument( + "-I", + "--input-moltype", + "--input-molecule-type", + choices=["DNA", "dna", "protein"], + default="DNA", + help="molecule type of input sequence (DNA or protein)", + ) + + def main(self, args): + print_version() + if not args.param_string: + args.param_string = ["k=31,scaled=1000,dna"] + + # Check and append 'dna' if no moltype is found in a param string + moltypes = ["dna", "protein", "dayhoff", "hp"] + updated_param_strings = [] + + for param in args.param_string: + # If no moltype is found in the current param string, append 'dna' + if not any(mt in param for mt in moltypes): + param += ",dna" + updated_param_strings.append(param) + + notify(f"params: {updated_param_strings}") + + # Convert the list of param strings to a single string + args.param_string = "_".join(updated_param_strings).lower() + + # If --name is not provided, default to input_filename, but if the source file is -, set name to empty string + signature_name = ( + args.name + if args.name + else ( + os.path.basename(args.input_filename) + if args.input_filename != "-" + else "" + ) + ) + + notify( + f"sketching file '{args.input_filename}' ({args.input_moltype}) with params '{args.param_string}' and name '{signature_name}' using a single thread" + ) + + super().main(args) + status = sourmash_plugin_branchwater.do_singlesketch( + args.input_filename, + args.input_moltype, + args.param_string, + args.output, + signature_name, + ) # Pass the name to Rust + if status == 0: + notify(f"...singlesketch is done! results in '{args.output}'") + return status + + class Branchwater_Manysketch(CommandLinePlugin): - command = 'manysketch' - description = 'massively parallel sketching' + command = "manysketch" + description = "massively parallel sketching" def __init__(self, p): super().__init__(p) - p.add_argument('fromfile_csv', help="a csv file containing paths to FASTA files. \ - Columns must be: 'name,genome_filename,protein_filename' or 'name,read1,read2'") - p.add_argument('-o', '--output', required=True, - help='output zip file for the signatures') - p.add_argument('-p', '--param-string', action='append', type=str, default=[], - help='parameter string for sketching (default: k=31,scaled=1000)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('-s', '--singleton', action="store_true", - help='build one sketch per FASTA record, i.e. multiple sketches per FASTA file') - p.add_argument('-f', '--force', action="store_true", - help='allow use of individual FASTA files in more than more sketch') + p.add_argument( + "fromfile_csv", + help="a csv file containing paths to FASTA files. \ + Columns must be: 'name,genome_filename,protein_filename' or 'name,read1,read2'", + ) + p.add_argument( + "-o", "--output", required=True, help="output zip file for the signatures" + ) + p.add_argument( + "-p", + "--param-string", + action="append", + type=str, + default=[], + help="parameter string for sketching (default: k=31,scaled=1000)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "-s", + "--singleton", + action="store_true", + help="build one sketch per FASTA record, i.e. multiple sketches per FASTA file", + ) + p.add_argument( + "-f", + "--force", + action="store_true", + help="allow use of individual FASTA files in more than more sketch", + ) def main(self, args): print_version() if not args.param_string: - args.param_string = ["k=31,scaled=1000"] + args.param_string = ["dna,k=31,scaled=1000"] notify(f"params: {args.param_string}") # convert to a single string for easier rust handling @@ -417,51 +737,135 @@ def main(self, args): num_threads = set_thread_pool(args.cores) - notify(f"sketching all files in '{args.fromfile_csv}' using {num_threads} threads") + notify( + f"sketching all files in '{args.fromfile_csv}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_manysketch(args.fromfile_csv, - args.param_string, - args.output, - args.singleton, - args.force) + status = sourmash_plugin_branchwater.do_manysketch( + args.fromfile_csv, + args.param_string, + args.output, + args.singleton, + args.force, + ) if status == 0: notify(f"...manysketch is done! results in '{args.output}'") return status + class Branchwater_Cluster(CommandLinePlugin): - command = 'cluster' + command = "cluster" description = 'cluster from "pairwise" or "multisearch" results' def __init__(self, p): super().__init__(p) - p.add_argument('pairwise_csv', help="a csv file containing similarity information. \ - Currently, only a branchwater 'pairwise' or 'multisearch' file will work") - p.add_argument('-o', '--output', required=True, - help='output csv file for the clusters') - p.add_argument('--cluster-sizes', default=None, - help='output file for the cluster size histogram') - p.add_argument('--similarity-column', type=str, default='average_containment_ani', - choices=['containment', 'max_containment', 'jaccard', 'average_containment_ani', 'max_containment_ani'], - help='column to use as similarity measure') - p.add_argument('-t', '--threshold', type=float, default=0.95, help="similarity threshold for clustering. Default: 95%% ANI (0.95)") - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') + p.add_argument( + "pairwise_csv", + help="a csv file containing similarity information. \ + Currently, only a branchwater 'pairwise' or 'multisearch' file will work", + ) + p.add_argument( + "-o", "--output", required=True, help="output csv file for the clusters" + ) + p.add_argument( + "--cluster-sizes", + default=None, + help="output file for the cluster size histogram", + ) + p.add_argument( + "--similarity-column", + type=str, + default="average_containment_ani", + choices=[ + "containment", + "max_containment", + "jaccard", + "average_containment_ani", + "max_containment_ani", + ], + help="column to use as similarity measure", + ) + p.add_argument( + "-t", + "--threshold", + type=float, + default=0.95, + help="similarity threshold for clustering. Default: 95%% ANI (0.95)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) def main(self, args): print_version() num_threads = set_thread_pool(args.cores) - notify(f"generating clusters for comparisons in '{args.pairwise_csv}' using {num_threads} threads") + notify( + f"generating clusters for comparisons in '{args.pairwise_csv}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_cluster(args.pairwise_csv, - args.output, - args.similarity_column, - args.threshold, - args.cluster_sizes) + status = sourmash_plugin_branchwater.do_cluster( + args.pairwise_csv, + args.output, + args.similarity_column, + args.threshold, + args.cluster_sizes, + ) if status == 0: notify(f"...clustering is done! results in '{args.output}'") notify(f" cluster counts in '{args.cluster_sizes}'") return status + + +### + +class BranchwaterManifestWrapper: + def __init__(self, mf_obj): + self.obj = mf_obj + + def _check_row_values(self): + return self.obj._check_row_values() + + @property + def rows(self): + return self.obj.rows + + +class BranchwaterCollectionWrapper: + def __init__(self, coll_obj): + self.obj = coll_obj + + @property + def location(self): + return self.obj.location + + @property + def is_database(self): + return self.obj.is_database + + @property + def has_manifest(self): + return self.obj.has_manifest + + @property + def manifest(self): + return BranchwaterManifestWrapper(self.obj.manifest) + + def __len__(self): + return len(self.obj) + + +def load_collection(path, *, traverse_yield_all=False, cache_size=0): + try: + coll_obj = api.api_load_collection(path, 31, 100_000, 'DNA') + return BranchwaterCollectionWrapper(coll_obj) + except: + raise IndexNotLoaded(f"branchwater could not load '{path}'") +load_collection.priority = 20