diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 942fa524..58b80788 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -11,17 +11,20 @@ __version__ = importlib.metadata.version("sourmash_plugin_branchwater") + def print_version(): - notify(f"=> sourmash_plugin_branchwater {__version__}; cite Irber et al., doi: 10.1101/2022.11.02.514947\n") + notify( + f"=> sourmash_plugin_branchwater {__version__}; cite Irber et al., doi: 10.1101/2022.11.02.514947\n" + ) def get_max_cores(): try: - if 'SLURM_CPUS_ON_NODE' in os.environ: - return int(os.environ['SLURM_CPUS_ON_NODE']) - elif 'SLURM_JOB_CPUS_PER_NODE' in os.environ: - cpus_per_node_str = os.environ['SLURM_JOB_CPUS_PER_NODE'] - return int(cpus_per_node_str.split('x')[0]) + if "SLURM_CPUS_ON_NODE" in os.environ: + return int(os.environ["SLURM_CPUS_ON_NODE"]) + elif "SLURM_JOB_CPUS_PER_NODE" in os.environ: + cpus_per_node_str = os.environ["SLURM_JOB_CPUS_PER_NODE"] + return int(cpus_per_node_str.split("x")[0]) else: return os.cpu_count() except Exception: @@ -32,58 +35,101 @@ def set_thread_pool(user_cores): avail_threads = get_max_cores() num_threads = min(avail_threads, user_cores) if user_cores else avail_threads if user_cores and user_cores > avail_threads: - notify(f"warning: only {avail_threads} threads available, using {avail_threads}") + notify( + f"warning: only {avail_threads} threads available, using {avail_threads}" + ) actual_rayon_cores = sourmash_plugin_branchwater.set_global_thread_pool(num_threads) return actual_rayon_cores class Branchwater_Manysearch(CommandLinePlugin): - command = 'manysearch' - description = 'search many metagenomes for contained genomes' + command = "manysearch" + description = "search many metagenomes for contained genomes" def __init__(self, p): super().__init__(p) - p.add_argument('query_paths', - help="input file of sketches") - p.add_argument('against_paths', - help="input file of sketches") - p.add_argument('-o', '--output', required=True, - help='CSV output file for matches') - p.add_argument('-t', '--threshold', default=0.01, type=float, - help='containment threshold for reporting matches (default: 0.01)') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to select sketches') - p.add_argument('-s', '--scaled', default=1000, type=int, - help='scaled factor at which to do comparisons') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('-P', '--pretty-print', action='store_true', - default=True, - help="display results after search finishes (default: True)") - p.add_argument('-N', '--no-pretty-print', action='store_false', - dest='pretty_print', - help="do not display results (e.g. for large output)") - p.add_argument('--ignore-abundance', action='store_true', - help="do not do expensive abundance calculations") + p.add_argument("query_paths", help="input file of sketches") + p.add_argument("against_paths", help="input file of sketches") + p.add_argument( + "-o", "--output", required=True, help="CSV output file for matches" + ) + p.add_argument( + "-t", + "--threshold", + default=0.01, + type=float, + help="containment threshold for reporting matches (default: 0.01)", + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to select sketches", + ) + p.add_argument( + "-s", + "--scaled", + default=1000, + type=int, + help="scaled factor at which to do comparisons", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "-P", + "--pretty-print", + action="store_true", + default=True, + help="display results after search finishes (default: True)", + ) + p.add_argument( + "-N", + "--no-pretty-print", + action="store_false", + dest="pretty_print", + help="do not display results (e.g. for large output)", + ) + p.add_argument( + "--ignore-abundance", + action="store_true", + help="do not do expensive abundance calculations", + ) def main(self, args): print_version() - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}" + ) num_threads = set_thread_pool(args.cores) - notify(f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads") + notify( + f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_manysearch(args.query_paths, - args.against_paths, - args.threshold, - args.ksize, - args.scaled, - args.moltype, - args.output, - args.ignore_abundance) + status = sourmash_plugin_branchwater.do_manysearch( + args.query_paths, + args.against_paths, + args.threshold, + args.ksize, + args.scaled, + args.moltype, + args.output, + args.ignore_abundance, + ) if status == 0: notify(f"...manysearch is done! results in '{args.output}'") @@ -93,46 +139,80 @@ def main(self, args): class Branchwater_Fastgather(CommandLinePlugin): - command = 'fastgather' - description = 'massively parallel sketch gather' + command = "fastgather" + description = "massively parallel sketch gather" def __init__(self, p): super().__init__(p) - p.add_argument('query_sig', help="metagenome sketch") - p.add_argument('against_paths', help="input file of sketches") - p.add_argument('-o', '--output-gather', required=True, - help="save gather output (minimum metagenome cover) to this file") - p.add_argument('--output-prefetch', - help="save prefetch output (all overlaps) to this file") - p.add_argument('-t', '--threshold-bp', default=50000, type=float, - help='threshold in estimated base pairs, for reporting matches (default: 50kb)') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to do comparisons (default: 31)') - p.add_argument('-s', '--scaled', default=1000, type=int, - help='scaled factor at which to do comparisons (default: 1000)') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - + p.add_argument("query_sig", help="metagenome sketch") + p.add_argument("against_paths", help="input file of sketches") + p.add_argument( + "-o", + "--output-gather", + required=True, + help="save gather output (minimum metagenome cover) to this file", + ) + p.add_argument( + "--output-prefetch", help="save prefetch output (all overlaps) to this file" + ) + p.add_argument( + "-t", + "--threshold-bp", + default=50000, + type=float, + help="threshold in estimated base pairs, for reporting matches (default: 50kb)", + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to do comparisons (default: 31)", + ) + p.add_argument( + "-s", + "--scaled", + default=1000, + type=int, + help="scaled factor at which to do comparisons (default: 1000)", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) def main(self, args): print_version() - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold bp: {args.threshold_bp}") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold bp: {args.threshold_bp}" + ) num_threads = set_thread_pool(args.cores) - - notify(f"gathering all sketches in '{args.query_sig}' against '{args.against_paths}' using {num_threads} threads") + notify( + f"gathering all sketches in '{args.query_sig}' against '{args.against_paths}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_fastgather(args.query_sig, - args.against_paths, - int(args.threshold_bp), - args.ksize, - args.scaled, - args.moltype, - args.output_gather, - args.output_prefetch) + status = sourmash_plugin_branchwater.do_fastgather( + args.query_sig, + args.against_paths, + int(args.threshold_bp), + args.ksize, + args.scaled, + args.moltype, + args.output_gather, + args.output_prefetch, + ) if status == 0: notify(f"...fastgather is done! gather results in '{args.output_gather}'") if args.output_prefetch: @@ -141,107 +221,182 @@ def main(self, args): class Branchwater_Fastmultigather(CommandLinePlugin): - command = 'fastmultigather' - description = 'massively parallel sketch multigather' + command = "fastmultigather" + description = "massively parallel sketch multigather" def __init__(self, p): super().__init__(p) - p.add_argument('query_paths', help="input file of sketches to query") - p.add_argument('against_paths', help="input file of sketches to search against \ - OR a branchwater indexed database generated with 'sourmash scripts index'") - p.add_argument('-t', '--threshold-bp', default=50000, type=float, - help='threshold in estimated base pairs, for reporting matches (default: 50kb)') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to do comparisons (default: 31)') - p.add_argument('-s', '--scaled', default=None, type=int, - help='scaled factor at which to do comparisons (default: determined from query collection)') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('-o', '--output', help='CSV output file for matches. Used for non-rocksdb searches only.') - p.add_argument('--create-empty-results', action = 'store_true', - default=False, help='create empty results file(s) even if no matches') - p.add_argument('--save-matches', action='store_true', - default=False, help='save matched hashes for every input to a signature') - + p.add_argument("query_paths", help="input file of sketches to query") + p.add_argument( + "against_paths", + help="input file of sketches to search against \ + OR a branchwater indexed database generated with 'sourmash scripts index'", + ) + p.add_argument( + "-t", + "--threshold-bp", + default=50000, + type=float, + help="threshold in estimated base pairs, for reporting matches (default: 50kb)", + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to do comparisons (default: 31)", + ) + p.add_argument( + "-s", + "--scaled", + default=None, + type=int, + help="scaled factor at which to do comparisons (default: determined from query collection)", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "-o", + "--output", + help="CSV output file for matches. Used for non-rocksdb searches only.", + ) + p.add_argument( + "--create-empty-results", + action="store_true", + default=False, + help="create empty results file(s) even if no matches", + ) + p.add_argument( + "--save-matches", + action="store_true", + default=False, + help="save matched hashes for every input to a signature", + ) def main(self, args): print_version() - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold bp: {args.threshold_bp} / save matches: {args.save_matches}") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold bp: {args.threshold_bp} / save matches: {args.save_matches}" + ) num_threads = set_thread_pool(args.cores) - notify(f"gathering all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads") + notify( + f"gathering all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_fastmultigather(args.query_paths, - args.against_paths, - int(args.threshold_bp), - args.ksize, - args.scaled, - args.moltype, - args.output, - args.save_matches, - args.create_empty_results - ) + status = sourmash_plugin_branchwater.do_fastmultigather( + args.query_paths, + args.against_paths, + int(args.threshold_bp), + args.ksize, + args.scaled, + args.moltype, + args.output, + args.save_matches, + args.create_empty_results, + ) if status == 0: notify(f"...fastmultigather is done!") return status class Branchwater_Index(CommandLinePlugin): - command = 'index' - description = 'Build Branchwater RevIndex' + command = "index" + description = "Build Branchwater RevIndex" def __init__(self, p): super().__init__(p) - p.add_argument('siglist', - help="input file of sketches") - p.add_argument('-o', '--output', required=True, - help='output file for the index') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to select sketches') - p.add_argument('-s', '--scaled', default=1000, type=int, - help='scaled factor at which to do comparisons') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('--internal-storage', default=True, action='store_true', - help="build indexes that contain sketches and are relocatable (default: True)") - p.add_argument('--no-internal-storage', '--no-store-sketches', - action='store_false', - help="do not store sketches in the index; index may not be relocatable (default: False)", - dest='internal_storage') + p.add_argument("siglist", help="input file of sketches") + p.add_argument( + "-o", "--output", required=True, help="output file for the index" + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to select sketches", + ) + p.add_argument( + "-s", + "--scaled", + default=1000, + type=int, + help="scaled factor at which to do comparisons", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "--internal-storage", + default=True, + action="store_true", + help="build indexes that contain sketches and are relocatable (default: True)", + ) + p.add_argument( + "--no-internal-storage", + "--no-store-sketches", + action="store_false", + help="do not store sketches in the index; index may not be relocatable (default: False)", + dest="internal_storage", + ) def main(self, args): - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} ") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} " + ) num_threads = set_thread_pool(args.cores) notify(f"indexing all sketches in '{args.siglist}'") super().main(args) - status = sourmash_plugin_branchwater.do_index(args.siglist, - args.ksize, - args.scaled, - args.moltype, - args.output, - False, # colors - currently must be false? - args.internal_storage) + status = sourmash_plugin_branchwater.do_index( + args.siglist, + args.ksize, + args.scaled, + args.moltype, + args.output, + False, # colors - currently must be false? + args.internal_storage, + ) if status == 0: notify(f"...index is done! results in '{args.output}'") return status + class Branchwater_Check(CommandLinePlugin): - command = 'check' - description = 'Check Branchwater RevIndex' + command = "check" + description = "Check Branchwater RevIndex" def __init__(self, p): super().__init__(p) - p.add_argument('index', - help="RocksDB index file created with 'index'") - p.add_argument('--quick', action='store_true') + p.add_argument("index", help="RocksDB index file created with 'index'") + p.add_argument("--quick", action="store_true") def main(self, args): notify(f"checking index '{args.index}'") @@ -253,109 +408,191 @@ def main(self, args): class Branchwater_Multisearch(CommandLinePlugin): - command = 'multisearch' - description = 'massively parallel in-memory sketch search' + command = "multisearch" + description = "massively parallel in-memory sketch search" def __init__(self, p): super().__init__(p) - p.add_argument('query_paths', - help="input file of sketches") - p.add_argument('against_paths', - help="input file of sketches") - p.add_argument('-o', '--output', required=True, - help='CSV output file for matches') - p.add_argument('-t', '--threshold', default=0.01, type=float, - help='containment threshold for reporting matches (default: 0.01)') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to select sketches') - p.add_argument('-s', '--scaled', default=None, type=int, - help='scaled factor at which to do comparisons (default: determined from query collection)') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('-a', '--ani', action='store_true', - help='estimate ANI from containment') + p.add_argument("query_paths", help="input file of sketches") + p.add_argument("against_paths", help="input file of sketches") + p.add_argument( + "-o", "--output", required=True, help="CSV output file for matches" + ) + p.add_argument( + "-t", + "--threshold", + default=0.01, + type=float, + help="containment threshold for reporting matches (default: 0.01)", + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to select sketches", + ) + p.add_argument( + "-s", + "--scaled", + default=None, + type=int, + help="scaled factor at which to do comparisons (default: determined from query collection)", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "-a", "--ani", action="store_true", help="estimate ANI from containment" + ) def main(self, args): print_version() - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}" + ) num_threads = set_thread_pool(args.cores) - notify(f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads") + notify( + f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_multisearch(args.query_paths, - args.against_paths, - args.threshold, - args.ksize, - args.scaled, - args.moltype, - args.ani, - args.output) + status = sourmash_plugin_branchwater.do_multisearch( + args.query_paths, + args.against_paths, + args.threshold, + args.ksize, + args.scaled, + args.moltype, + args.ani, + args.output, + ) if status == 0: notify(f"...multisearch is done! results in '{args.output}'") return status - + + class Branchwater_Pairwise(CommandLinePlugin): - command = 'pairwise' - description = 'massively parallel in-memory pairwise comparisons' + command = "pairwise" + description = "massively parallel in-memory pairwise comparisons" def __init__(self, p): super().__init__(p) - p.add_argument('sig_paths', - help="input file of sketches") - p.add_argument('-o', '--output', required=True, - help='CSV output file for matches') - p.add_argument('-t', '--threshold', default=0.01, type=float, - help='containment threshold for reporting matches') - p.add_argument('-k', '--ksize', default=31, type=int, - help='k-mer size at which to select sketches') - p.add_argument('-s', '--scaled', default=1000, type=int, - help='scaled factor at which to do comparisons') - p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], - help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('-a', '--ani', action='store_true', - help='estimate ANI from containment') - p.add_argument('--write-all', action="store_true", - help="write self comparisons for all sketches") + p.add_argument("sig_paths", help="input file of sketches") + p.add_argument( + "-o", "--output", required=True, help="CSV output file for matches" + ) + p.add_argument( + "-t", + "--threshold", + default=0.01, + type=float, + help="containment threshold for reporting matches", + ) + p.add_argument( + "-k", + "--ksize", + default=31, + type=int, + help="k-mer size at which to select sketches", + ) + p.add_argument( + "-s", + "--scaled", + default=1000, + type=int, + help="scaled factor at which to do comparisons", + ) + p.add_argument( + "-m", + "--moltype", + default="DNA", + choices=["DNA", "protein", "dayhoff", "hp"], + help="molecule type (DNA, protein, dayhoff, or hp; default DNA)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "-a", "--ani", action="store_true", help="estimate ANI from containment" + ) + p.add_argument( + "--write-all", + action="store_true", + help="write self comparisons for all sketches", + ) def main(self, args): print_version() - notify(f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}") + notify( + f"ksize: {args.ksize} / scaled: {args.scaled} / moltype: {args.moltype} / threshold: {args.threshold}" + ) num_threads = set_thread_pool(args.cores) - notify(f"pairwise-comparing all sketches in '{args.sig_paths}' using {num_threads} threads") + notify( + f"pairwise-comparing all sketches in '{args.sig_paths}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_pairwise(args.sig_paths, - args.threshold, - args.ksize, - args.scaled, - args.moltype, - args.ani, - args.write_all, - args.output) + status = sourmash_plugin_branchwater.do_pairwise( + args.sig_paths, + args.threshold, + args.ksize, + args.scaled, + args.moltype, + args.ani, + args.write_all, + args.output, + ) if status == 0: notify(f"...pairwise is done! results in '{args.output}'") return status + class Branchwater_SingleSketch(CommandLinePlugin): - command = 'singlesketch' - description = 'sketch a single sequence file' + command = "singlesketch" + description = "sketch a single sequence file" def __init__(self, p): super().__init__(p) - p.add_argument('input_filename', help="input FASTA file or '-' for stdin") - p.add_argument('-o', '--output', required=True, - help='output file for the signature or - for stdout') - p.add_argument('-p', '--param-string', action='append', type=str, default=[], - help='parameter string for sketching (default: k=31,scaled=1000)') - p.add_argument('-n', '--name', help="optional name for the signature, default is the basename of input path") + p.add_argument("input_filename", help="input FASTA file or '-' for stdin") + p.add_argument( + "-o", + "--output", + required=True, + help="output file for the signature or - for stdout", + ) + p.add_argument( + "-p", + "--param-string", + action="append", + type=str, + default=[], + help="parameter string for sketching (default: k=31,scaled=1000)", + ) + p.add_argument( + "-n", + "--name", + help="optional name for the signature, default is the basename of input path", + ) def main(self, args): print_version() @@ -378,38 +615,70 @@ def main(self, args): args.param_string = "_".join(updated_param_strings).lower() # If --name is not provided, default to input_filename, but if the source file is -, set name to empty string - signature_name = args.name if args.name else os.path.basename(args.input_filename) if args.input_filename != "-" else "" - - notify(f"sketching file '{args.input_filename}' with params '{args.param_string}' and name '{signature_name}'") + signature_name = ( + args.name + if args.name + else ( + os.path.basename(args.input_filename) + if args.input_filename != "-" + else "" + ) + ) + + notify( + f"sketching file '{args.input_filename}' with params '{args.param_string}' and name '{signature_name}'" + ) super().main(args) - status = sourmash_plugin_branchwater.do_singlesketch(args.input_filename, - args.param_string, - args.output, - signature_name) # Pass the name to Rust + status = sourmash_plugin_branchwater.do_singlesketch( + args.input_filename, args.param_string, args.output, signature_name + ) # Pass the name to Rust if status == 0: notify(f"...singlesketch is done! results in '{args.output}'") return status class Branchwater_Manysketch(CommandLinePlugin): - command = 'manysketch' - description = 'massively parallel sketching' + command = "manysketch" + description = "massively parallel sketching" def __init__(self, p): super().__init__(p) - p.add_argument('fromfile_csv', help="a csv file containing paths to FASTA files. \ - Columns must be: 'name,genome_filename,protein_filename' or 'name,read1,read2'") - p.add_argument('-o', '--output', required=True, - help='output zip file for the signatures') - p.add_argument('-p', '--param-string', action='append', type=str, default=[], - help='parameter string for sketching (default: k=31,scaled=1000)') - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') - p.add_argument('-s', '--singleton', action="store_true", - help='build one sketch per FASTA record, i.e. multiple sketches per FASTA file') - p.add_argument('-f', '--force', action="store_true", - help='allow use of individual FASTA files in more than more sketch') + p.add_argument( + "fromfile_csv", + help="a csv file containing paths to FASTA files. \ + Columns must be: 'name,genome_filename,protein_filename' or 'name,read1,read2'", + ) + p.add_argument( + "-o", "--output", required=True, help="output zip file for the signatures" + ) + p.add_argument( + "-p", + "--param-string", + action="append", + type=str, + default=[], + help="parameter string for sketching (default: k=31,scaled=1000)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) + p.add_argument( + "-s", + "--singleton", + action="store_true", + help="build one sketch per FASTA record, i.e. multiple sketches per FASTA file", + ) + p.add_argument( + "-f", + "--force", + action="store_true", + help="allow use of individual FASTA files in more than more sketch", + ) def main(self, args): print_version() @@ -424,50 +693,87 @@ def main(self, args): num_threads = set_thread_pool(args.cores) - notify(f"sketching all files in '{args.fromfile_csv}' using {num_threads} threads") + notify( + f"sketching all files in '{args.fromfile_csv}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_manysketch(args.fromfile_csv, - args.param_string, - args.output, - args.singleton, - args.force) + status = sourmash_plugin_branchwater.do_manysketch( + args.fromfile_csv, + args.param_string, + args.output, + args.singleton, + args.force, + ) if status == 0: notify(f"...manysketch is done! results in '{args.output}'") return status + class Branchwater_Cluster(CommandLinePlugin): - command = 'cluster' + command = "cluster" description = 'cluster from "pairwise" or "multisearch" results' def __init__(self, p): super().__init__(p) - p.add_argument('pairwise_csv', help="a csv file containing similarity information. \ - Currently, only a branchwater 'pairwise' or 'multisearch' file will work") - p.add_argument('-o', '--output', required=True, - help='output csv file for the clusters') - p.add_argument('--cluster-sizes', default=None, - help='output file for the cluster size histogram') - p.add_argument('--similarity-column', type=str, default='average_containment_ani', - choices=['containment', 'max_containment', 'jaccard', 'average_containment_ani', 'max_containment_ani'], - help='column to use as similarity measure') - p.add_argument('-t', '--threshold', type=float, default=0.95, help="similarity threshold for clustering. Default: 95%% ANI (0.95)") - p.add_argument('-c', '--cores', default=0, type=int, - help='number of cores to use (default is all available)') + p.add_argument( + "pairwise_csv", + help="a csv file containing similarity information. \ + Currently, only a branchwater 'pairwise' or 'multisearch' file will work", + ) + p.add_argument( + "-o", "--output", required=True, help="output csv file for the clusters" + ) + p.add_argument( + "--cluster-sizes", + default=None, + help="output file for the cluster size histogram", + ) + p.add_argument( + "--similarity-column", + type=str, + default="average_containment_ani", + choices=[ + "containment", + "max_containment", + "jaccard", + "average_containment_ani", + "max_containment_ani", + ], + help="column to use as similarity measure", + ) + p.add_argument( + "-t", + "--threshold", + type=float, + default=0.95, + help="similarity threshold for clustering. Default: 95%% ANI (0.95)", + ) + p.add_argument( + "-c", + "--cores", + default=0, + type=int, + help="number of cores to use (default is all available)", + ) def main(self, args): print_version() num_threads = set_thread_pool(args.cores) - notify(f"generating clusters for comparisons in '{args.pairwise_csv}' using {num_threads} threads") + notify( + f"generating clusters for comparisons in '{args.pairwise_csv}' using {num_threads} threads" + ) super().main(args) - status = sourmash_plugin_branchwater.do_cluster(args.pairwise_csv, - args.output, - args.similarity_column, - args.threshold, - args.cluster_sizes) + status = sourmash_plugin_branchwater.do_cluster( + args.pairwise_csv, + args.output, + args.similarity_column, + args.threshold, + args.cluster_sizes, + ) if status == 0: notify(f"...clustering is done! results in '{args.output}'") notify(f" cluster counts in '{args.cluster_sizes}'") diff --git a/src/python/sourmash_plugin_branchwater/prettyprint.py b/src/python/sourmash_plugin_branchwater/prettyprint.py index f948f792..3390d7f6 100644 --- a/src/python/sourmash_plugin_branchwater/prettyprint.py +++ b/src/python/sourmash_plugin_branchwater/prettyprint.py @@ -1,16 +1,17 @@ import csv + def pretty_print_manysearch(manysearch_csv): "Pretty-print the manysearch output." - with open(manysearch_csv, newline='') as fp: + with open(manysearch_csv, newline="") as fp: r = csv.DictReader(fp) rows = list(r) - rows.sort(key=lambda row: row['query_name']) # sort by metagenome, for now + rows.sort(key=lambda row: row["query_name"]) # sort by metagenome, for now first = True for row in rows: - has_abundance = 'average_abund' in row + has_abundance = "average_abund" in row # # display! @@ -23,21 +24,25 @@ def pretty_print_manysearch(manysearch_csv): print("-------- -------- --------- ------- ---------------") first = False - f_genome_found = float(row['containment']) + f_genome_found = float(row["containment"]) pct_genome = f"{f_genome_found*100:.1f}" if has_abundance: - n_weighted_found = int(row['n_weighted_found']) - total_weighted_hashes = int(row['total_weighted_hashes']) - f_metag_weighted = n_weighted_found / total_weighted_hashes # results_d['f_match_weighted'] + n_weighted_found = int(row["n_weighted_found"]) + total_weighted_hashes = int(row["total_weighted_hashes"]) + f_metag_weighted = ( + n_weighted_found / total_weighted_hashes + ) # results_d['f_match_weighted'] pct_metag = f"{f_metag_weighted*100:.1f}%" - avg_abund = float(row['average_abund']) + avg_abund = float(row["average_abund"]) avg_abund = f"{avg_abund:.1f}" else: avg_abund = "N/A" pct_metag = "N/A" - query_name = row['query_name'][:17] - metag_name = row['match_name'][:17] - print(f'{query_name:<17} {pct_genome:>6}% {avg_abund:>6} {pct_metag:>6} {metag_name}') + query_name = row["query_name"][:17] + metag_name = row["match_name"][:17] + print( + f"{query_name:<17} {pct_genome:>6}% {avg_abund:>6} {pct_metag:>6} {metag_name}" + ) diff --git a/src/python/tests/__init__.py b/src/python/tests/__init__.py index c1a35185..384ea8b8 100644 --- a/src/python/tests/__init__.py +++ b/src/python/tests/__init__.py @@ -1,2 +1,3 @@ from sourmash_plugin_branchwater import sourmash_plugin_branchwater + sourmash_plugin_branchwater.set_global_thread_pool(4) diff --git a/src/python/tests/conftest.py b/src/python/tests/conftest.py index f6f0f7f4..75a4b8f4 100644 --- a/src/python/tests/conftest.py +++ b/src/python/tests/conftest.py @@ -2,6 +2,7 @@ from .sourmash_tst_utils import TempDirectory, RunnerContext + @pytest.fixture def runtmp(): with TempDirectory() as location: @@ -12,26 +13,32 @@ def runtmp(): def toggle_internal_storage(request): return request.param + @pytest.fixture(params=[True, False]) def zip_query(request): return request.param + @pytest.fixture(params=[True, False]) def zip_db(request): return request.param + @pytest.fixture(params=[True, False]) def zip_against(request): return request.param + @pytest.fixture(params=[True, False]) def indexed(request): return request.param + @pytest.fixture(params=[True, False]) def indexed_query(request): return request.param + @pytest.fixture(params=[True, False]) def indexed_against(request): return request.param diff --git a/src/python/tests/sourmash_tst_utils.py b/src/python/tests/sourmash_tst_utils.py index 0c0e0e00..dabce721 100644 --- a/src/python/tests/sourmash_tst_utils.py +++ b/src/python/tests/sourmash_tst_utils.py @@ -16,31 +16,49 @@ def get_test_data(filename): thisdir = os.path.dirname(__file__) - return os.path.join(thisdir, 'test-data', filename) + return os.path.join(thisdir, "test-data", filename) def make_file_list(filename, paths): - with open(filename, 'wt') as fp: + with open(filename, "wt") as fp: fp.write("\n".join(paths)) fp.write("\n") def zip_siglist(runtmp, siglist, db): - runtmp.sourmash('sig', 'cat', siglist, - '-o', db) + runtmp.sourmash("sig", "cat", siglist, "-o", db) return db -def index_siglist(runtmp, siglist, db, *, ksize=31, scaled=1000, moltype='DNA', - toggle_internal_storage='--internal-storage'): +def index_siglist( + runtmp, + siglist, + db, + *, + ksize=31, + scaled=1000, + moltype="DNA", + toggle_internal_storage="--internal-storage", +): # build index - runtmp.sourmash('scripts', 'index', siglist, - '-o', db, '-k', str(ksize), '--scaled', str(scaled), - '--moltype', moltype, toggle_internal_storage) + runtmp.sourmash( + "scripts", + "index", + siglist, + "-o", + db, + "-k", + str(ksize), + "--scaled", + str(scaled), + "--moltype", + moltype, + toggle_internal_storage, + ) return db -def scriptpath(scriptname='sourmash'): +def scriptpath(scriptname="sourmash"): """Return the path to the scripts, in both dev and install situations.""" # note - it doesn't matter what the scriptname is here, as long as # it's some script present in this version of sourmash. @@ -53,7 +71,7 @@ def scriptpath(scriptname='sourmash'): if os.path.exists(os.path.join(path, scriptname)): return path - for path in os.environ['PATH'].split(':'): + for path in os.environ["PATH"].split(":"): if os.path.exists(os.path.join(path, scriptname)): return path @@ -61,10 +79,10 @@ def scriptpath(scriptname='sourmash'): def _runscript(scriptname): """Find & run a script with exec (i.e. not via os.system or subprocess).""" namespace = {"__name__": "__main__"} - namespace['sys'] = globals()['sys'] + namespace["sys"] = globals()["sys"] try: - pkg_resources.load_entry_point("sourmash", 'console_scripts', scriptname)() + pkg_resources.load_entry_point("sourmash", "console_scripts", scriptname)() return 0 except pkg_resources.ResolutionError: pass @@ -75,15 +93,15 @@ def _runscript(scriptname): if os.path.isfile(scriptfile): if os.path.isfile(scriptfile): exec( # pylint: disable=exec-used - compile(open(scriptfile).read(), scriptfile, 'exec'), - namespace) + compile(open(scriptfile).read(), scriptfile, "exec"), namespace + ) return 0 return -1 -ScriptResults = collections.namedtuple('ScriptResults', - ['status', 'out', 'err']) +ScriptResults = collections.namedtuple("ScriptResults", ["status", "out", "err"]) + def runscript(scriptname, args, **kwargs): """Run a Python script using exec(). @@ -99,8 +117,8 @@ def runscript(scriptname, args, **kwargs): sysargs.extend(args) cwd = os.getcwd() - in_directory = kwargs.get('in_directory', cwd) - fail_ok = kwargs.get('fail_ok', False) + in_directory = kwargs.get("in_directory", cwd) + fail_ok = kwargs.get("fail_ok", False) try: status = -1 @@ -108,8 +126,8 @@ def runscript(scriptname, args, **kwargs): sys.argv = sysargs oldin = None - if 'stdin_data' in kwargs: - oldin, sys.stdin = sys.stdin, StringIO(kwargs['stdin_data']) + if "stdin_data" in kwargs: + oldin, sys.stdin = sys.stdin, StringIO(kwargs["stdin_data"]) oldout, olderr = sys.stdout, sys.stderr sys.stdout = StringIO() @@ -119,8 +137,8 @@ def runscript(scriptname, args, **kwargs): os.chdir(in_directory) try: - print('running:', scriptname, 'in:', in_directory, file=oldout) - print('arguments', sysargs, file=oldout) + print("running:", scriptname, "in:", in_directory, file=oldout) + print("arguments", sysargs, file=oldout) status = _runscript(scriptname) except SystemExit as err: @@ -150,7 +168,7 @@ def runscript(scriptname, args, **kwargs): class TempDirectory(object): def __init__(self): - self.tempdir = tempfile.mkdtemp(prefix='sourmashtest_') + self.tempdir = tempfile.mkdtemp(prefix="sourmashtest_") def __enter__(self): return self.tempdir @@ -168,7 +186,7 @@ def __exit__(self, exc_type, exc_value, traceback): class SourmashCommandFailed(Exception): def __init__(self, msg): Exception.__init__(self, msg) - self.message = msg + self.message = msg class RunnerContext(object): @@ -181,6 +199,7 @@ class RunnerContext(object): You can use the 'output' method to build filenames in my temp directory. """ + def __init__(self, location): self.location = location self.last_command = None @@ -188,25 +207,26 @@ def __init__(self, location): def run_sourmash(self, *args, **kwargs): "Run the sourmash script with the given arguments." - kwargs['fail_ok'] = True - if 'in_directory' not in kwargs: - kwargs['in_directory'] = self.location + kwargs["fail_ok"] = True + if "in_directory" not in kwargs: + kwargs["in_directory"] = self.location - cmdlist = ['sourmash'] - cmdlist.extend(( str(x) for x in args)) + cmdlist = ["sourmash"] + cmdlist.extend((str(x) for x in args)) self.last_command = " ".join(cmdlist) - self.last_result = runscript('sourmash', args, **kwargs) + self.last_result = runscript("sourmash", args, **kwargs) if self.last_result.status: raise SourmashCommandFailed(self.last_result.err) return self.last_result + sourmash = run_sourmash def run(self, scriptname, *args, **kwargs): "Run a script with the given arguments." - if 'in_directory' not in kwargs: - kwargs['in_directory'] = self.location + if "in_directory" not in kwargs: + kwargs["in_directory"] = self.location self.last_command = " ".join(args) self.last_result = runscript(scriptname, args, **kwargs) return self.last_result @@ -224,11 +244,11 @@ def __str__(self): if self.last_result.out: s += "- stdout:\n---\n{}---\n".format(self.last_result.out) else: - s += '(no stdout)\n\n' + s += "(no stdout)\n\n" if self.last_result.err: s += "- stderr:\n---\n{}---\n".format(self.last_result.err) else: - s += '(no stderr)\n' + s += "(no stderr)\n" return s diff --git a/src/python/tests/test_cluster.py b/src/python/tests/test_cluster.py index 4ae12173..c986c1a2 100644 --- a/src/python/tests/test_cluster.py +++ b/src/python/tests/test_cluster.py @@ -7,14 +7,14 @@ def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'cluster') + runtmp.sourmash("scripts", "cluster") - assert 'usage: cluster' in runtmp.last_result.err + assert "usage: cluster" in runtmp.last_result.err def test_cluster_help(runtmp): # test sourmash scripts cluster --help /-h - runtmp.sourmash('scripts', 'cluster', '-h') + runtmp.sourmash("scripts", "cluster", "-h") print(runtmp.last_result.err) out = runtmp.last_result.out @@ -25,170 +25,208 @@ def test_cluster_help(runtmp): assert "options:" in out -def test_cluster_containment(runtmp): - pairwise_csv = get_test_data('cluster.pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - threshold = '0.5' - - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output, - '--similarity-column', "containment", "--cluster-sizes", - sizes, '--threshold', threshold) +def test_cluster_containment(runtmp): + pairwise_csv = get_test_data("cluster.pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + threshold = "0.5" + + runtmp.sourmash( + "scripts", + "cluster", + pairwise_csv, + "-o", + output, + "--similarity-column", + "containment", + "--cluster-sizes", + sizes, + "--threshold", + threshold, + ) assert os.path.exists(output) # check cluster output - with open(output, mode='r', newline='') as csvfile: + with open(output, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster','nodes'] + assert reader.fieldnames == ["cluster", "nodes"] assert len(rows) == 1, f"Expected 1 data row but found {len(rows)}" - assert rows[0]['cluster'] == 'Component_1' - expected = set("n2;n3;n7;n1;n6;n5;n4".split(';')) - assert set(rows[0]['nodes'].split(';')) == expected + assert rows[0]["cluster"] == "Component_1" + expected = set("n2;n3;n7;n1;n6;n5;n4".split(";")) + assert set(rows[0]["nodes"].split(";")) == expected # check cluster size histogram - with open(sizes, mode='r', newline='') as csvfile: + with open(sizes, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster_size','count'] + assert reader.fieldnames == ["cluster_size", "count"] assert len(rows) == 1, f"Expected 1 data row but found {len(rows)}" - assert rows[0]['cluster_size'] == '7' - assert rows[0]['count'] == '1' + assert rows[0]["cluster_size"] == "7" + assert rows[0]["count"] == "1" def test_cluster_max_containment_1(runtmp): - pairwise_csv = get_test_data('cluster.pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - threshold = '0.7' - - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output, - '--similarity-column', "max_containment", "--cluster-sizes", - sizes, '--threshold', threshold) + pairwise_csv = get_test_data("cluster.pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + threshold = "0.7" + + runtmp.sourmash( + "scripts", + "cluster", + pairwise_csv, + "-o", + output, + "--similarity-column", + "max_containment", + "--cluster-sizes", + sizes, + "--threshold", + threshold, + ) assert os.path.exists(output) # check cluster output - with open(output, mode='r', newline='') as csvfile: + with open(output, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster','nodes'] + assert reader.fieldnames == ["cluster", "nodes"] assert len(rows) == 1, f"Expected 1 data row but found {len(rows)}" - assert rows[0]['cluster'] == 'Component_1' - expected = set("n2;n3;n7;n1;n6;n5;n4".split(';')) - assert set(rows[0]['nodes'].split(';')) == expected + assert rows[0]["cluster"] == "Component_1" + expected = set("n2;n3;n7;n1;n6;n5;n4".split(";")) + assert set(rows[0]["nodes"].split(";")) == expected # check cluster size histogram - with open(sizes, mode='r', newline='') as csvfile: + with open(sizes, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster_size','count'] + assert reader.fieldnames == ["cluster_size", "count"] assert len(rows) == 1, f"Expected 1 data row but found {len(rows)}" - assert rows[0]['cluster_size'] == '7' - assert rows[0]['count'] == '1' + assert rows[0]["cluster_size"] == "7" + assert rows[0]["count"] == "1" def test_cluster_max_containment_2(runtmp): - pairwise_csv = get_test_data('cluster.pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - threshold = '0.9' - - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output, - '--similarity-column', "max_containment", "--cluster-sizes", - sizes, '--threshold', threshold) + pairwise_csv = get_test_data("cluster.pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + threshold = "0.9" + + runtmp.sourmash( + "scripts", + "cluster", + pairwise_csv, + "-o", + output, + "--similarity-column", + "max_containment", + "--cluster-sizes", + sizes, + "--threshold", + threshold, + ) assert os.path.exists(output) # check cluster output - with open(output, mode='r', newline='') as csvfile: + with open(output, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster','nodes'] + assert reader.fieldnames == ["cluster", "nodes"] assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" - assert rows[0]['cluster'] == 'Component_1' + assert rows[0]["cluster"] == "Component_1" expected_node_sets = [ - set("n1;n2;n3;n4;n5".split(';')), - set("n6;n7".split(';')), + set("n1;n2;n3;n4;n5".split(";")), + set("n6;n7".split(";")), ] for row in rows: - assert set(row['nodes'].split(';')) in expected_node_sets + assert set(row["nodes"].split(";")) in expected_node_sets # check cluster size histogram - with open(sizes, mode='r', newline='') as csvfile: + with open(sizes, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster_size','count'] + assert reader.fieldnames == ["cluster_size", "count"] assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" rows_as_tuples = {tuple(row.values()) for row in rows} - expected = {('5', '1'), ('2', '1')} + expected = {("5", "1"), ("2", "1")} assert rows_as_tuples == expected -def test_cluster_jaccard(runtmp): - pairwise_csv = get_test_data('cluster.pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - threshold = '0.6' - - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output, - '--similarity-column', "jaccard", "--cluster-sizes", - sizes, '--threshold', threshold) +def test_cluster_jaccard(runtmp): + pairwise_csv = get_test_data("cluster.pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + threshold = "0.6" + + runtmp.sourmash( + "scripts", + "cluster", + pairwise_csv, + "-o", + output, + "--similarity-column", + "jaccard", + "--cluster-sizes", + sizes, + "--threshold", + threshold, + ) assert os.path.exists(output) # check cluster output - with open(output, mode='r', newline='') as csvfile: + with open(output, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster','nodes'] + assert reader.fieldnames == ["cluster", "nodes"] assert len(rows) == 4, f"Expected 4 data rows but found {len(rows)}" - assert rows[0]['cluster'] == 'Component_1' + assert rows[0]["cluster"] == "Component_1" expected_node_sets = [ - set("n3;n4;n5;n6".split(';')), - set("n1".split(';')), - set("n2".split(';')), - set("n7".split(';')) + set("n3;n4;n5;n6".split(";")), + set("n1".split(";")), + set("n2".split(";")), + set("n7".split(";")), ] for row in rows: - assert set(row['nodes'].split(';')) in expected_node_sets + assert set(row["nodes"].split(";")) in expected_node_sets # check cluster size histogram - with open(sizes, mode='r', newline='') as csvfile: + with open(sizes, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster_size','count'] + assert reader.fieldnames == ["cluster_size", "count"] assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" rows_as_tuples = {tuple(row.values()) for row in rows} - expected = {('1', '3'), ('4', '1')} + expected = {("1", "3"), ("4", "1")} assert rows_as_tuples == expected def test_cluster_default_similarity(runtmp): - pairwise_csv = get_test_data('cluster.pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - threshold = '0.9' + pairwise_csv = get_test_data("cluster.pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + threshold = "0.9" - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output, - '--threshold', threshold) + runtmp.sourmash( + "scripts", "cluster", pairwise_csv, "-o", output, "--threshold", threshold + ) assert os.path.exists(output) # check cluster output - with open(output, mode='r', newline='') as csvfile: + with open(output, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster','nodes'] + assert reader.fieldnames == ["cluster", "nodes"] assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" - assert rows[0]['cluster'] == 'Component_1' - expected_node_sets = [ - set("n1;n2;n3;n4;n5".split(';')), - set("n6;n7".split(';')) - ] + assert rows[0]["cluster"] == "Component_1" + expected_node_sets = [set("n1;n2;n3;n4;n5".split(";")), set("n6;n7".split(";"))] for row in rows: - assert set(row['nodes'].split(';')) in expected_node_sets + assert set(row["nodes"].split(";")) in expected_node_sets # check cluster size histogram assert not os.path.exists(sizes) @@ -196,270 +234,346 @@ def test_cluster_default_similarity(runtmp): def test_cluster_default_threshold(runtmp): # test default threshold (0.95) - pairwise_csv = get_test_data('cluster.pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') + pairwise_csv = get_test_data("cluster.pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output) + runtmp.sourmash("scripts", "cluster", pairwise_csv, "-o", output) assert os.path.exists(output) # check cluster output - with open(output, mode='r', newline='') as csvfile: + with open(output, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster','nodes'] + assert reader.fieldnames == ["cluster", "nodes"] assert len(rows) == 5, f"Expected 5 data rows but found {len(rows)}" - assert rows[0]['cluster'] == 'Component_1' + assert rows[0]["cluster"] == "Component_1" expected_node_sets = [ - set("n1".split(';')), - set("n2;n3;n4".split(';')), - set("n5".split(';')), - set("n6".split(';')), - set("n7".split(';')) + set("n1".split(";")), + set("n2;n3;n4".split(";")), + set("n5".split(";")), + set("n6".split(";")), + set("n7".split(";")), ] for row in rows: - assert set(row['nodes'].split(';')) in expected_node_sets + assert set(row["nodes"].split(";")) in expected_node_sets # check cluster size histogram assert not os.path.exists(sizes) def test_cluster_ani(runtmp): - pairwise_csv = get_test_data('cluster.pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - threshold = '0.9' - - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output, - '--similarity-column', "average_containment_ani", "--cluster-sizes", - sizes, '--threshold', threshold) + pairwise_csv = get_test_data("cluster.pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + threshold = "0.9" + + runtmp.sourmash( + "scripts", + "cluster", + pairwise_csv, + "-o", + output, + "--similarity-column", + "average_containment_ani", + "--cluster-sizes", + sizes, + "--threshold", + threshold, + ) assert os.path.exists(output) # check cluster output - with open(output, mode='r', newline='') as csvfile: + with open(output, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster','nodes'] + assert reader.fieldnames == ["cluster", "nodes"] assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" - assert rows[0]['cluster'] == 'Component_1' - expected_node_sets = [ - set("n1;n2;n3;n4;n5".split(';')), - set("n6;n7".split(';')) - ] + assert rows[0]["cluster"] == "Component_1" + expected_node_sets = [set("n1;n2;n3;n4;n5".split(";")), set("n6;n7".split(";"))] for row in rows: - assert set(row['nodes'].split(';')) in expected_node_sets + assert set(row["nodes"].split(";")) in expected_node_sets # check cluster size histogram - with open(sizes, mode='r', newline='') as csvfile: + with open(sizes, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster_size','count'] + assert reader.fieldnames == ["cluster_size", "count"] assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" rows_as_tuples = {tuple(row.values()) for row in rows} - expected = {('5', '1'), ('2', '1')} + expected = {("5", "1"), ("2", "1")} assert rows_as_tuples == expected def test_cluster_max_ani(runtmp): - pairwise_csv = get_test_data('cluster.pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - threshold = '0.9' - - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output, - '--similarity-column', "max_containment_ani", "--cluster-sizes", - sizes, '--threshold', threshold) + pairwise_csv = get_test_data("cluster.pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + threshold = "0.9" + + runtmp.sourmash( + "scripts", + "cluster", + pairwise_csv, + "-o", + output, + "--similarity-column", + "max_containment_ani", + "--cluster-sizes", + sizes, + "--threshold", + threshold, + ) - assert os.path.exists(output) + assert os.path.exists(output) # check cluster output - with open(output, mode='r', newline='') as csvfile: + with open(output, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster','nodes'] + assert reader.fieldnames == ["cluster", "nodes"] assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" - assert rows[0]['cluster'] == 'Component_1' - expected_node_sets = [set("n1;n2;n3;n4;n5".split(';')), set("n6;n7".split(';'))] + assert rows[0]["cluster"] == "Component_1" + expected_node_sets = [set("n1;n2;n3;n4;n5".split(";")), set("n6;n7".split(";"))] for row in rows: - assert set(row['nodes'].split(';')) in expected_node_sets + assert set(row["nodes"].split(";")) in expected_node_sets # check cluster size histogram - with open(sizes, mode='r', newline='') as csvfile: + with open(sizes, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster_size','count'] + assert reader.fieldnames == ["cluster_size", "count"] assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" rows_as_tuples = {tuple(row.values()) for row in rows} - expected = {('5', '1'), ('2', '1')} + expected = {("5", "1"), ("2", "1")} assert rows_as_tuples == expected def test_cluster_ani_pairwise(runtmp): - pairwise_csv = runtmp.output('pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - cluster_threshold = '0.90' + pairwise_csv = runtmp.output("pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + cluster_threshold = "0.90" - query_list = runtmp.output('query.txt') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query_list = runtmp.output("query.txt") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', pairwise_csv, "-t", "-0.1", "--ani") + runtmp.sourmash( + "scripts", "pairwise", query_list, "-o", pairwise_csv, "-t", "-0.1", "--ani" + ) assert os.path.exists(pairwise_csv) - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output, - '--similarity-column', "average_containment_ani", "--cluster-sizes", - sizes, '--threshold', cluster_threshold) + runtmp.sourmash( + "scripts", + "cluster", + pairwise_csv, + "-o", + output, + "--similarity-column", + "average_containment_ani", + "--cluster-sizes", + sizes, + "--threshold", + cluster_threshold, + ) assert os.path.exists(output) # check cluster output - with open(output, mode='r', newline='') as csvfile: + with open(output, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster','nodes'] + assert reader.fieldnames == ["cluster", "nodes"] print(rows) assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" - assert rows[0]['cluster'] == 'Component_1' - expected_node_sets = [set("NC_009661.1;NC_011665.1".split(';')), set("CP001071.1".split(';'))] + assert rows[0]["cluster"] == "Component_1" + expected_node_sets = [ + set("NC_009661.1;NC_011665.1".split(";")), + set("CP001071.1".split(";")), + ] for row in rows: - assert set(row['nodes'].split(';')) in expected_node_sets + assert set(row["nodes"].split(";")) in expected_node_sets # check cluster size histogram - with open(sizes, mode='r', newline='') as csvfile: + with open(sizes, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster_size','count'] + assert reader.fieldnames == ["cluster_size", "count"] assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" rows_as_tuples = {tuple(row.values()) for row in rows} - expected = {('1', '1'), ('2', '1')} + expected = {("1", "1"), ("2", "1")} assert rows_as_tuples == expected def test_cluster_avg_ani_no_ani(runtmp, capfd): - pairwise_csv = runtmp.output('pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - cluster_threshold = '0.9' + pairwise_csv = runtmp.output("pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + cluster_threshold = "0.9" - query_list = runtmp.output('query.txt') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query_list = runtmp.output("query.txt") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', pairwise_csv, "-t", "-0.1") # do not pass `--ani` + runtmp.sourmash( + "scripts", "pairwise", query_list, "-o", pairwise_csv, "-t", "-0.1" + ) # do not pass `--ani` assert os.path.exists(pairwise_csv) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output, - '--similarity-column', "average_containment_ani", "--cluster-sizes", - sizes, '--threshold', cluster_threshold) + runtmp.sourmash( + "scripts", + "cluster", + pairwise_csv, + "-o", + output, + "--similarity-column", + "average_containment_ani", + "--cluster-sizes", + sizes, + "--threshold", + cluster_threshold, + ) print(runtmp.last_result.err) captured = capfd.readouterr() print(captured.err) - assert 'average_containment_ani is None. Did you estimate ANI?' in captured.err + assert "average_containment_ani is None. Did you estimate ANI?" in captured.err def test_cluster_max_ani_no_ani(runtmp, capfd): - pairwise_csv = runtmp.output('pairwise.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - cluster_threshold = '0.9' + pairwise_csv = runtmp.output("pairwise.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + cluster_threshold = "0.9" - query_list = runtmp.output('query.txt') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query_list = runtmp.output("query.txt") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', pairwise_csv, "-t", "-0.1") # do not pass `--ani` + runtmp.sourmash( + "scripts", "pairwise", query_list, "-o", pairwise_csv, "-t", "-0.1" + ) # do not pass `--ani` assert os.path.exists(pairwise_csv) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'cluster', pairwise_csv, '-o', output, - '--similarity-column', "max_containment_ani", "--cluster-sizes", - sizes, '--threshold', cluster_threshold) + runtmp.sourmash( + "scripts", + "cluster", + pairwise_csv, + "-o", + output, + "--similarity-column", + "max_containment_ani", + "--cluster-sizes", + sizes, + "--threshold", + cluster_threshold, + ) print(runtmp.last_result.err) captured = capfd.readouterr() print(captured.err) - assert 'max_containment_ani is None. Did you estimate ANI?' in captured.err + assert "max_containment_ani is None. Did you estimate ANI?" in captured.err def test_cluster_ani_multisearch(runtmp): - multisearch_csv = runtmp.output('multisearch.csv') - output = runtmp.output('clusters.csv') - sizes = runtmp.output('sizes.csv') - cluster_threshold = '0.90' + multisearch_csv = runtmp.output("multisearch.csv") + output = runtmp.output("clusters.csv") + sizes = runtmp.output("sizes.csv") + cluster_threshold = "0.90" - query_list = runtmp.output('query.txt') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query_list = runtmp.output("query.txt") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - runtmp.sourmash('scripts', 'multisearch', query_list, query_list, - '-o', multisearch_csv, "-t", "-0.1", "--ani") + runtmp.sourmash( + "scripts", + "multisearch", + query_list, + query_list, + "-o", + multisearch_csv, + "-t", + "-0.1", + "--ani", + ) assert os.path.exists(multisearch_csv) - runtmp.sourmash('scripts', 'cluster', multisearch_csv, '-o', output, - '--similarity-column', "average_containment_ani", "--cluster-sizes", - sizes, '--threshold', cluster_threshold) + runtmp.sourmash( + "scripts", + "cluster", + multisearch_csv, + "-o", + output, + "--similarity-column", + "average_containment_ani", + "--cluster-sizes", + sizes, + "--threshold", + cluster_threshold, + ) assert os.path.exists(output) # check cluster output - with open(output, mode='r', newline='') as csvfile: + with open(output, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster','nodes'] + assert reader.fieldnames == ["cluster", "nodes"] print(rows) assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" - assert rows[0]['cluster'] == 'Component_1' - expected_node_sets = [set("NC_009661.1;NC_011665.1".split(';')), set("CP001071.1".split(';'))] + assert rows[0]["cluster"] == "Component_1" + expected_node_sets = [ + set("NC_009661.1;NC_011665.1".split(";")), + set("CP001071.1".split(";")), + ] for row in rows: - assert set(row['nodes'].split(';')) in expected_node_sets + assert set(row["nodes"].split(";")) in expected_node_sets # check cluster size histogram - with open(sizes, mode='r', newline='') as csvfile: + with open(sizes, mode="r", newline="") as csvfile: reader = csv.DictReader(csvfile) rows = [row for row in reader] - assert reader.fieldnames == ['cluster_size','count'] + assert reader.fieldnames == ["cluster_size", "count"] assert len(rows) == 2, f"Expected 2 data rows but found {len(rows)}" rows_as_tuples = {tuple(row.values()) for row in rows} - expected = {('1', '1'), ('2', '1')} + expected = {("1", "1"), ("2", "1")} assert rows_as_tuples == expected def test_empty_file(runtmp, capfd): # test with an empty query list - csv = runtmp.output('empty.csv') + csv = runtmp.output("empty.csv") make_file_list(csv, []) - output = runtmp.output('out.csv') - out2 = runtmp.output('counts.csv') + output = runtmp.output("out.csv") + out2 = runtmp.output("counts.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'cluster', csv, - '-o', output, '--cluster-sizes', out2) + runtmp.sourmash( + "scripts", "cluster", csv, "-o", output, "--cluster-sizes", out2 + ) print(runtmp.last_result.err) captured = capfd.readouterr() @@ -470,18 +584,19 @@ def test_empty_file(runtmp, capfd): def test_bad_file(runtmp, capfd): # test with an empty query list - csv = runtmp.output('bad.csv') - with open(csv, 'w') as out: - out.write('column1,column2') + csv = runtmp.output("bad.csv") + with open(csv, "w") as out: + out.write("column1,column2") make_file_list(csv, []) - output = runtmp.output('out.csv') - out2 = runtmp.output('counts.csv') + output = runtmp.output("out.csv") + out2 = runtmp.output("counts.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'cluster', csv, - '-o', output, '--cluster-sizes', out2) + runtmp.sourmash( + "scripts", "cluster", csv, "-o", output, "--cluster-sizes", out2 + ) print(runtmp.last_result.err) captured = capfd.readouterr() diff --git a/src/python/tests/test_fastgather.py b/src/python/tests/test_fastgather.py index f444818f..99cacf82 100644 --- a/src/python/tests/test_fastgather.py +++ b/src/python/tests/test_fastgather.py @@ -4,44 +4,54 @@ import sourmash from . import sourmash_tst_utils as utils -from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist, - index_siglist) +from .sourmash_tst_utils import ( + get_test_data, + make_file_list, + zip_siglist, + index_siglist, +) def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastgather') + runtmp.sourmash("scripts", "fastgather") - assert 'usage: fastgather' in runtmp.last_result.err + assert "usage: fastgather" in runtmp.last_result.err -def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, toggle_internal_storage): +def test_simple( + runtmp, capfd, indexed_query, indexed_against, zip_against, toggle_internal_storage +): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) if indexed_query: - query = index_siglist(runtmp, query, runtmp.output('query'), - scaled=100000) + query = index_siglist(runtmp, query, runtmp.output("query"), scaled=100000) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) if indexed_against: - against_list = index_siglist(runtmp, against_list, runtmp.output('db'), - toggle_internal_storage=toggle_internal_storage) - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '-s', '100000') + against_list = index_siglist( + runtmp, + against_list, + runtmp.output("db"), + toggle_internal_storage=toggle_internal_storage, + ) + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", "fastgather", query, against_list, "-o", g_output, "-s", "100000" + ) assert os.path.exists(g_output) captured = capfd.readouterr() @@ -50,178 +60,278 @@ def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, togg df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) # CTB note: we do not need to worry about this warning for query from a # RocksDB, since there is only one. if indexed_against: - print('indexed against:', indexed_against) - assert "WARNING: loading all sketches from a RocksDB into memory!" in captured.err + print("indexed against:", indexed_against) + assert ( + "WARNING: loading all sketches from a RocksDB into memory!" in captured.err + ) def test_simple_with_prefetch(runtmp, zip_against, indexed, toggle_internal_storage): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db'), - toggle_internal_storage=toggle_internal_storage) - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + against_list = index_siglist( + runtmp, + against_list, + runtmp.output("db"), + toggle_internal_storage=toggle_internal_storage, + ) + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) assert os.path.exists(g_output) assert os.path.exists(p_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) df = pandas.read_csv(p_output) assert len(df) == 3 keys = set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + assert keys == { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + } def test_simple_with_prefetch_list_of_zips(runtmp): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.sig.zip') - sig47 = get_test_data('47.sig.zip') - sig63 = get_test_data('63.sig.zip') + sig2 = get_test_data("2.sig.zip") + sig47 = get_test_data("47.sig.zip") + sig63 = get_test_data("63.sig.zip") make_file_list(against_list, [sig2, sig47, sig63]) - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) assert os.path.exists(g_output) assert os.path.exists(p_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) df = pandas.read_csv(p_output) assert len(df) == 3 keys = set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + assert keys == { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + } def test_missing_query(runtmp, capfd, zip_against): # test missing query - query = runtmp.output('no-such-file') - against_list = runtmp.output('against.txt') + query = runtmp.output("no-such-file") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory' in captured.err + assert "Error: No such file or directory" in captured.err def test_bad_query(runtmp, capfd, zip_against): # test non-sig query - query = runtmp.output('no-such-file') - against_list = runtmp.output('against.txt') + query = runtmp.output("no-such-file") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") # query doesn't need to be a sig anymore - sig, zip, or pathlist welcome # as long as there's only one sketch that matches params - make_file_list(query, [sig2,sig47]) + make_file_list(query, [sig2, sig47]) make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) captured = capfd.readouterr() print(captured.err) - assert 'Error: Fastgather requires a single query sketch. Check input:' in captured.err + assert ( + "Error: Fastgather requires a single query sketch. Check input:" in captured.err + ) def test_missing_against(runtmp, capfd, zip_against): # test missing against - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") # don't make against list if zip_against: - against_list = runtmp.output('against.zip') + against_list = runtmp.output("against.zip") - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory' in captured.err + assert "Error: No such file or directory" in captured.err def test_sig_against(runtmp, capfd): - # sig file is ok as against file now - query = get_test_data('SRR606249.sig.gz') - - sig2 = get_test_data('2.fa.sig.gz') - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, sig2, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + # sig file is ok as against file now + query = get_test_data("SRR606249.sig.gz") + + sig2 = get_test_data("2.fa.sig.gz") + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + sig2, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) captured = capfd.readouterr() print(captured.err) @@ -231,108 +341,158 @@ def test_sig_against(runtmp, capfd): df = pandas.read_csv(g_output) assert len(df) == 1 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) def test_bad_against(runtmp, capfd): # test bad 'against' file - in this case, one containing a bad filename. - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') - - sig2 = get_test_data('2.fa.sig.gz') - make_file_list(against_list, [sig2, 'no-exist']) - - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") + + sig2 = get_test_data("2.fa.sig.gz") + make_file_list(against_list, [sig2, "no-exist"]) + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) captured = capfd.readouterr() print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err + assert ( + "WARNING: 1 search paths failed to load. See error messages above." + in captured.err + ) def test_bad_against_2(runtmp, capfd): # test bad 'against' file - in this case, one containing an empty file - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - empty_file = runtmp.output('empty.sig') - with open(empty_file, 'wb') as fp: + sig2 = get_test_data("2.fa.sig.gz") + empty_file = runtmp.output("empty.sig") + with open(empty_file, "wb") as fp: pass make_file_list(against_list, [sig2, empty_file]) - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) captured = capfd.readouterr() print(captured.err) - assert "Sketch loading error: File is too short, less than five bytes" in captured.err + assert ( + "Sketch loading error: File is too short, less than five bytes" in captured.err + ) assert "WARNING: could not load sketches from path" in captured.err - assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err + assert ( + "WARNING: 1 search paths failed to load. See error messages above." + in captured.err + ) def test_bad_against_3(runtmp, capfd): # test with a bad against (a .sig.gz file renamed as zip file) - query = get_test_data('SRR606249.sig.gz') + query = get_test_data("SRR606249.sig.gz") - sig2 = get_test_data('2.fa.sig.gz') - against_zip = runtmp.output('against.zip') + sig2 = get_test_data("2.fa.sig.gz") + against_zip = runtmp.output("against.zip") # cp sig2 into against_zip - with open(against_zip, 'wb') as fp: - with open(sig2, 'rb') as fp2: + with open(against_zip, "wb") as fp: + with open(sig2, "rb") as fp2: fp.write(fp2.read()) - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastgather', query, against_zip, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_zip, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) captured = capfd.readouterr() print(captured.err) - assert 'InvalidArchive' in captured.err + assert "InvalidArchive" in captured.err @pytest.mark.xfail(reason="should work, bug") def test_against_multisigfile(runtmp, zip_against): # test against a sigfile that contains multiple sketches - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - combined = runtmp.output('combined.sig.gz') - runtmp.sourmash('sig', 'cat', sig2, sig47, sig63, '-o', combined) + combined = runtmp.output("combined.sig.gz") + runtmp.sourmash("sig", "cat", sig2, sig47, sig63, "-o", combined) make_file_list(against_list, [combined]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) df = pandas.read_csv(g_output) assert len(df) == 3 print(df) @@ -340,82 +500,111 @@ def test_against_multisigfile(runtmp, zip_against): def test_query_multisigfile(runtmp, capfd, zip_against): # test with a sigfile that contains multiple sketches - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - combined = runtmp.output('combined.sig.gz') - runtmp.sourmash('sig', 'cat', sig2, sig47, sig63, '-o', combined) + combined = runtmp.output("combined.sig.gz") + runtmp.sourmash("sig", "cat", sig2, sig47, sig63, "-o", combined) make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastgather', combined, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + runtmp.sourmash( + "scripts", + "fastgather", + combined, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) # this fails now :) captured = capfd.readouterr() print(captured.err) - assert "Error: Fastgather requires a single query sketch. Check input:" in captured.err + assert ( + "Error: Fastgather requires a single query sketch. Check input:" in captured.err + ) def test_against_nomatch(runtmp, capfd, zip_against): # test with 'against' file containing a non-matching ksize - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig1 = get_test_data('1.fa.k21.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig1 = get_test_data("1.fa.k21.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig1, sig47, sig63]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 search paths - no compatible signatures.' in captured.err + assert "WARNING: skipped 1 search paths - no compatible signatures." in captured.err def test_md5s(runtmp, zip_against): # check that the correct md5sums (of the original sketches) are in # the output files - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) assert os.path.exists(g_output) assert os.path.exists(p_output) @@ -423,9 +612,17 @@ def test_md5s(runtmp, zip_against): df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) - - md5s = list(df['match_md5']) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) + + md5s = list(df["match_md5"]) print(md5s) for against_file in (sig2, sig47, sig63): @@ -438,9 +635,16 @@ def test_md5s(runtmp, zip_against): keys = set(df.keys()) # prefetch output has no rank. - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} - - md5s = list(df['match_md5']) + assert keys == { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + } + + md5s = list(df["match_md5"]) print(md5s) for against_file in (sig2, sig47, sig63): @@ -450,79 +654,133 @@ def test_md5s(runtmp, zip_against): def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_against): # the column names should be strict subsets of sourmash prefetch cols - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") # first run fastgather - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) assert os.path.exists(g_output) assert os.path.exists(p_output) # now run sourmash prefetch - sp_output = runtmp.output('sourmash-prefetch.csv') - runtmp.sourmash('prefetch', query, against_list, - '-o', sp_output, '--scaled', '100000') + sp_output = runtmp.output("sourmash-prefetch.csv") + runtmp.sourmash( + "prefetch", query, against_list, "-o", sp_output, "--scaled", "100000" + ) gather_df = pandas.read_csv(g_output) g_keys = set(gather_df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(g_keys) - g_keys.remove('gather_result_rank') # 'gather_result_rank' is not in sourmash prefetch! + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(g_keys) + g_keys.remove( + "gather_result_rank" + ) # 'gather_result_rank' is not in sourmash prefetch! sourmash_prefetch_df = pandas.read_csv(sp_output) sp_keys = set(sourmash_prefetch_df.keys()) print(g_keys - sp_keys) diff_keys = g_keys - sp_keys - assert diff_keys == set(['unique_intersect_bp', 'median_abund', 'f_match_orig', 'std_abund', 'average_abund', 'f_unique_to_query', 'remaining_bp', 'f_unique_weighted', 'sum_weighted_found', 'total_weighted_hashes', 'n_unique_weighted_found', 'f_orig_query', 'f_match']) + assert diff_keys == set( + [ + "unique_intersect_bp", + "median_abund", + "f_match_orig", + "std_abund", + "average_abund", + "f_unique_to_query", + "remaining_bp", + "f_unique_weighted", + "sum_weighted_found", + "total_weighted_hashes", + "n_unique_weighted_found", + "f_orig_query", + "f_match", + ] + ) def test_fastgather_gatherout_as_picklist(runtmp, zip_against): # should be able to use fastgather gather output as picklist - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") # first run fastgather - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) assert os.path.exists(g_output) assert os.path.exists(p_output) # now run sourmash gather using as picklist as picklist - gather_picklist_output = runtmp.output('sourmash-gather+picklist.csv') - runtmp.sourmash('gather', query, against_list, - '-o', gather_picklist_output, '--scaled', '100000', - '--picklist', f'{g_output}:match_name:ident') + gather_picklist_output = runtmp.output("sourmash-gather+picklist.csv") + runtmp.sourmash( + "gather", + query, + against_list, + "-o", + gather_picklist_output, + "--scaled", + "100000", + "--picklist", + f"{g_output}:match_name:ident", + ) # finally, run sourmash gather using fastgather gather output as picklist - full_gather_output = runtmp.output('sourmash-gather.csv') - runtmp.sourmash('gather', query, against_list, - '-o', full_gather_output, '--scaled', '100000') + full_gather_output = runtmp.output("sourmash-gather.csv") + runtmp.sourmash( + "gather", query, against_list, "-o", full_gather_output, "--scaled", "100000" + ) picklist_df = pandas.read_csv(gather_picklist_output) full_df = pandas.read_csv(full_gather_output) @@ -532,38 +790,56 @@ def test_fastgather_gatherout_as_picklist(runtmp, zip_against): def test_fastgather_prefetchout_as_picklist(runtmp, zip_against): # should be able to use fastgather prefetch output as picklist - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") # first run fastgather - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + runtmp.sourmash( + "scripts", + "fastgather", + query, + against_list, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) assert os.path.exists(g_output) assert os.path.exists(p_output) # now run sourmash gather using fastgather prefetch output as picklist - gather_picklist_output = runtmp.output('sourmash-gather+picklist.csv') - runtmp.sourmash('gather', query, against_list, - '-o', gather_picklist_output, '--scaled', '100000', - '--picklist', f'{p_output}:match_name:ident') + gather_picklist_output = runtmp.output("sourmash-gather+picklist.csv") + runtmp.sourmash( + "gather", + query, + against_list, + "-o", + gather_picklist_output, + "--scaled", + "100000", + "--picklist", + f"{p_output}:match_name:ident", + ) # finally, run sourmash gather using as picklist as picklist - full_gather_output = runtmp.output('sourmash-gather.csv') - runtmp.sourmash('gather', query, against_list, - '-o', full_gather_output, '--scaled', '100000') + full_gather_output = runtmp.output("sourmash-gather.csv") + runtmp.sourmash( + "gather", query, against_list, "-o", full_gather_output, "--scaled", "100000" + ) picklist_df = pandas.read_csv(gather_picklist_output) full_df = pandas.read_csv(full_gather_output) @@ -573,106 +849,188 @@ def test_fastgather_prefetchout_as_picklist(runtmp, zip_against): def test_simple_protein(runtmp): # test basic protein execution - sigs = get_test_data('protein.zip') + sigs = get_test_data("protein.zip") - query = runtmp.output('query.zip') - against = runtmp.output('against.zip') + query = runtmp.output("query.zip") + against = runtmp.output("against.zip") # extract query from zip file - runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) + runtmp.sourmash("sig", "extract", sigs, "--name", "GCA_001593935", "-o", query) # extract against from zip file - runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593925', '-o', against) - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against, - '-o', g_output, '-s', '100', '--moltype', 'protein', '-k', '19', - '--threshold', '0') + runtmp.sourmash("sig", "extract", sigs, "--name", "GCA_001593925", "-o", against) + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + against, + "-o", + g_output, + "-s", + "100", + "--moltype", + "protein", + "-k", + "19", + "--threshold", + "0", + ) assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 1 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) print(df) - assert df['match_md5'][0] == "16869d2c8a1d29d1c8e56f5c561e585e" + assert df["match_md5"][0] == "16869d2c8a1d29d1c8e56f5c561e585e" def test_simple_dayhoff(runtmp): # test basic protein execution - sigs = get_test_data('dayhoff.zip') + sigs = get_test_data("dayhoff.zip") - query = runtmp.output('query.zip') - against = runtmp.output('against.zip') + query = runtmp.output("query.zip") + against = runtmp.output("against.zip") # extract query from zip file - runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) + runtmp.sourmash("sig", "extract", sigs, "--name", "GCA_001593935", "-o", query) # extract against from zip file - runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593925', '-o', against) - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against, - '-o', g_output, '-s', '100', '--moltype', 'dayhoff', '-k', '19', - '--threshold', '0') + runtmp.sourmash("sig", "extract", sigs, "--name", "GCA_001593925", "-o", against) + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + against, + "-o", + g_output, + "-s", + "100", + "--moltype", + "dayhoff", + "-k", + "19", + "--threshold", + "0", + ) assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 1 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) print(df) - assert df['match_md5'][0] == "fbca5e5211e4d58427997fd5c8343e9a" + assert df["match_md5"][0] == "fbca5e5211e4d58427997fd5c8343e9a" def test_simple_hp(runtmp): # test basic protein execution - sigs = get_test_data('hp.zip') + sigs = get_test_data("hp.zip") - query = runtmp.output('query.zip') - against = runtmp.output('against.zip') + query = runtmp.output("query.zip") + against = runtmp.output("against.zip") # extract query from zip file - runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) + runtmp.sourmash("sig", "extract", sigs, "--name", "GCA_001593935", "-o", query) # extract against from zip file - runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593925', '-o', against) - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, against, - '-o', g_output, '-s', '100', '--moltype', 'hp', '-k', '19', - '--threshold', '0') + runtmp.sourmash("sig", "extract", sigs, "--name", "GCA_001593925", "-o", against) + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + against, + "-o", + g_output, + "-s", + "100", + "--moltype", + "hp", + "-k", + "19", + "--threshold", + "0", + ) assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 1 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) print(df) - assert df['match_md5'][0] == "ea2a1ad233c2908529d124a330bcb672" + assert df["match_md5"][0] == "ea2a1ad233c2908529d124a330bcb672" def test_indexed_against(runtmp, capfd): # accept rocksdb against, but with a warning - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") make_file_list(against_list, [sig2]) - db_against = runtmp.output('against.rocksdb') + db_against = runtmp.output("against.rocksdb") ## index against - runtmp.sourmash('scripts', 'index', against_list, - '-o', db_against, '-k', str(31), '--scaled', str(1000), - '--moltype', "DNA") - - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query, db_against, - '-o', g_output, '--output-prefetch', p_output, - '-s', '100000') + runtmp.sourmash( + "scripts", + "index", + against_list, + "-o", + db_against, + "-k", + str(31), + "--scaled", + str(1000), + "--moltype", + "DNA", + ) + + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query, + db_against, + "-o", + g_output, + "--output-prefetch", + p_output, + "-s", + "100000", + ) df = pandas.read_csv(g_output) assert len(df) == 1 @@ -685,12 +1043,12 @@ def test_indexed_against(runtmp, capfd): def test_simple_with_manifest_loading(runtmp): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) query_manifest = runtmp.output("query-manifest.csv") @@ -699,35 +1057,52 @@ def test_simple_with_manifest_loading(runtmp): runtmp.sourmash("sig", "manifest", query, "-o", query_manifest) runtmp.sourmash("sig", "manifest", against_list, "-o", against_manifest) - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') - - runtmp.sourmash('scripts', 'fastgather', query_manifest, against_manifest, - '-o', g_output, '-s', '100000') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") + + runtmp.sourmash( + "scripts", + "fastgather", + query_manifest, + against_manifest, + "-o", + g_output, + "-s", + "100000", + ) assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) def test_simple_full_output(runtmp): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) - g_output = runtmp.output('gather.csv') - p_output = runtmp.output('prefetch.csv') + g_output = runtmp.output("gather.csv") + p_output = runtmp.output("prefetch.csv") - runtmp.sourmash('scripts', 'fastgather', query, against_list, - '-o', g_output, '-s', '100000') + runtmp.sourmash( + "scripts", "fastgather", query, against_list, "-o", g_output, "-s", "100000" + ) assert os.path.exists(g_output) df = pandas.read_csv(g_output) @@ -735,27 +1110,61 @@ def test_simple_full_output(runtmp): keys = set(df.keys()) print(keys) print(df) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) - expected_keys = {'match_name', 'query_filename', 'query_n_hashes', 'match_filename', 'f_match_orig', - 'query_bp', 'query_abundance', 'match_containment_ani', 'intersect_bp', 'total_weighted_hashes', - 'n_unique_weighted_found', 'query_name', 'gather_result_rank', 'moltype', - 'query_containment_ani', 'sum_weighted_found', 'f_orig_query', 'ksize', 'max_containment_ani', - 'std_abund', 'scaled', 'average_containment_ani', 'f_match', 'f_unique_to_query', - 'average_abund', 'unique_intersect_bp', 'median_abund', 'query_md5', 'match_md5', 'remaining_bp', - 'f_unique_weighted'} + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) + expected_keys = { + "match_name", + "query_filename", + "query_n_hashes", + "match_filename", + "f_match_orig", + "query_bp", + "query_abundance", + "match_containment_ani", + "intersect_bp", + "total_weighted_hashes", + "n_unique_weighted_found", + "query_name", + "gather_result_rank", + "moltype", + "query_containment_ani", + "sum_weighted_found", + "f_orig_query", + "ksize", + "max_containment_ani", + "std_abund", + "scaled", + "average_containment_ani", + "f_match", + "f_unique_to_query", + "average_abund", + "unique_intersect_bp", + "median_abund", + "query_md5", + "match_md5", + "remaining_bp", + "f_unique_weighted", + } assert keys == expected_keys - md5s = set(df['match_md5']) + md5s = set(df["match_md5"]) for against_file in (sig2, sig47, sig63): for ss in sourmash.load_file_as_signatures(against_file, ksize=31): assert ss.md5sum() in md5s - intersect_bp = set(df['intersect_bp']) + intersect_bp = set(df["intersect_bp"]) assert intersect_bp == set([4400000, 4100000, 2200000]) - f_unique_to_query = set([round(x,4) for x in df['f_unique_to_query']]) + f_unique_to_query = set([round(x, 4) for x in df["f_unique_to_query"]]) assert f_unique_to_query == set([0.0052, 0.0105, 0.0043]) - query_containment_ani = set([round(x,4) for x in df['query_containment_ani']]) - assert query_containment_ani == { 0.8442, 0.8613, 0.8632 } + query_containment_ani = set([round(x, 4) for x in df["query_containment_ani"]]) + assert query_containment_ani == {0.8442, 0.8613, 0.8632} print(query_containment_ani) for index, row in df.iterrows(): print(row.to_dict()) @@ -763,29 +1172,39 @@ def test_simple_full_output(runtmp): def test_fullres_vs_sourmash_gather(runtmp): # fastgather results should match to sourmash gather results - query = get_test_data('SRR606249.sig.gz') + query = get_test_data("SRR606249.sig.gz") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") make_file_list(against_list, [sig2, sig47, sig63]) - g_output = runtmp.output('SRR606249.gather.csv') - runtmp.sourmash('scripts', 'fastgather', query_list, - against_list, '-s', '100000', '-t', '0', - '-o', g_output) + g_output = runtmp.output("SRR606249.gather.csv") + runtmp.sourmash( + "scripts", + "fastgather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + "-o", + g_output, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) assert os.path.exists(g_output) # now run sourmash gather - sg_output = runtmp.output('.csv') - runtmp.sourmash('gather', query, against_list, - '-o', sg_output, '--scaled', '100000') + sg_output = runtmp.output(".csv") + runtmp.sourmash( + "gather", query, against_list, "-o", sg_output, "--scaled", "100000" + ) gather_df = pandas.read_csv(g_output) g_keys = set(gather_df.keys()) @@ -794,88 +1213,115 @@ def test_fullres_vs_sourmash_gather(runtmp): sg_keys = set(sourmash_gather_df.keys()) print(sg_keys) modified_keys = ["match_md5", "match_name", "match_filename"] - sg_keys.update(modified_keys) # fastgather is more explicit (match_md5 instead of md5, etc) - print('g_keys - sg_keys:', g_keys - sg_keys) + sg_keys.update( + modified_keys + ) # fastgather is more explicit (match_md5 instead of md5, etc) + print("g_keys - sg_keys:", g_keys - sg_keys) assert not g_keys - sg_keys, g_keys - sg_keys for _idx, row in sourmash_gather_df.iterrows(): print(row.to_dict()) - fg_intersect_bp = set(gather_df['intersect_bp']) - g_intersect_bp = set(sourmash_gather_df['intersect_bp']) + fg_intersect_bp = set(gather_df["intersect_bp"]) + g_intersect_bp = set(sourmash_gather_df["intersect_bp"]) assert fg_intersect_bp == g_intersect_bp == set([4400000, 4100000, 2200000]) - fg_f_orig_query = set([round(x,4) for x in gather_df['f_orig_query']]) - g_f_orig_query = set([round(x,4) for x in sourmash_gather_df['f_orig_query']]) + fg_f_orig_query = set([round(x, 4) for x in gather_df["f_orig_query"]]) + g_f_orig_query = set([round(x, 4) for x in sourmash_gather_df["f_orig_query"]]) assert fg_f_orig_query == g_f_orig_query == set([0.0098, 0.0105, 0.0052]) - fg_f_match = set([round(x,4) for x in gather_df['f_match']]) - g_f_match = set([round(x,4) for x in sourmash_gather_df['f_match']]) + fg_f_match = set([round(x, 4) for x in gather_df["f_match"]]) + g_f_match = set([round(x, 4) for x in sourmash_gather_df["f_match"]]) assert fg_f_match == g_f_match == set([0.439, 1.0]) - fg_f_unique_to_query = set([round(x,3) for x in gather_df['f_unique_to_query']]) # rounding to 4 --> slightly different! - g_f_unique_to_query = set([round(x,3) for x in sourmash_gather_df['f_unique_to_query']]) + fg_f_unique_to_query = set( + [round(x, 3) for x in gather_df["f_unique_to_query"]] + ) # rounding to 4 --> slightly different! + g_f_unique_to_query = set( + [round(x, 3) for x in sourmash_gather_df["f_unique_to_query"]] + ) assert fg_f_unique_to_query == g_f_unique_to_query == set([0.004, 0.01, 0.005]) - fg_f_unique_weighted = set([round(x,4) for x in gather_df['f_unique_weighted']]) - g_f_unique_weighted = set([round(x,4) for x in sourmash_gather_df['f_unique_weighted']]) - assert fg_f_unique_weighted== g_f_unique_weighted == set([0.0063, 0.002, 0.0062]) - - fg_average_abund = set([round(x,4) for x in gather_df['average_abund']]) - g_average_abund = set([round(x,4) for x in sourmash_gather_df['average_abund']]) - assert fg_average_abund== g_average_abund == set([8.2222, 10.3864, 21.0455]) - - fg_median_abund = set([round(x,4) for x in gather_df['median_abund']]) - g_median_abund = set([round(x,4) for x in sourmash_gather_df['median_abund']]) - assert fg_median_abund== g_median_abund == set([8.0, 10.5, 21.5]) - - fg_std_abund = set([round(x,4) for x in gather_df['std_abund']]) - g_std_abund = set([round(x,4) for x in sourmash_gather_df['std_abund']]) - assert fg_std_abund== g_std_abund == set([3.172, 5.6446, 6.9322]) - - g_match_filename_basename = [os.path.basename(filename) for filename in sourmash_gather_df['filename']] - fg_match_filename_basename = [os.path.basename(filename) for filename in gather_df['match_filename']] - assert all([x in fg_match_filename_basename for x in ['2.fa.sig.gz', '63.fa.sig.gz', '47.fa.sig.gz']]) + fg_f_unique_weighted = set([round(x, 4) for x in gather_df["f_unique_weighted"]]) + g_f_unique_weighted = set( + [round(x, 4) for x in sourmash_gather_df["f_unique_weighted"]] + ) + assert fg_f_unique_weighted == g_f_unique_weighted == set([0.0063, 0.002, 0.0062]) + + fg_average_abund = set([round(x, 4) for x in gather_df["average_abund"]]) + g_average_abund = set([round(x, 4) for x in sourmash_gather_df["average_abund"]]) + assert fg_average_abund == g_average_abund == set([8.2222, 10.3864, 21.0455]) + + fg_median_abund = set([round(x, 4) for x in gather_df["median_abund"]]) + g_median_abund = set([round(x, 4) for x in sourmash_gather_df["median_abund"]]) + assert fg_median_abund == g_median_abund == set([8.0, 10.5, 21.5]) + + fg_std_abund = set([round(x, 4) for x in gather_df["std_abund"]]) + g_std_abund = set([round(x, 4) for x in sourmash_gather_df["std_abund"]]) + assert fg_std_abund == g_std_abund == set([3.172, 5.6446, 6.9322]) + + g_match_filename_basename = [ + os.path.basename(filename) for filename in sourmash_gather_df["filename"] + ] + fg_match_filename_basename = [ + os.path.basename(filename) for filename in gather_df["match_filename"] + ] + assert all( + [ + x in fg_match_filename_basename + for x in ["2.fa.sig.gz", "63.fa.sig.gz", "47.fa.sig.gz"] + ] + ) assert fg_match_filename_basename == g_match_filename_basename - assert list(sourmash_gather_df['name']) == list(gather_df['match_name']) - assert list(sourmash_gather_df['md5']) == list(gather_df['match_md5']) + assert list(sourmash_gather_df["name"]) == list(gather_df["match_name"]) + assert list(sourmash_gather_df["md5"]) == list(gather_df["match_md5"]) - fg_f_match_orig = set([round(x,4) for x in gather_df['f_match_orig']]) - g_f_match_orig = set([round(x,4) for x in sourmash_gather_df['f_match_orig']]) + fg_f_match_orig = set([round(x, 4) for x in gather_df["f_match_orig"]]) + g_f_match_orig = set([round(x, 4) for x in sourmash_gather_df["f_match_orig"]]) assert fg_f_match_orig == g_f_match_orig == set([1.0]) - fg_unique_intersect_bp = set(gather_df['unique_intersect_bp']) - g_unique_intersect_bp = set(sourmash_gather_df['unique_intersect_bp']) - assert fg_unique_intersect_bp == g_unique_intersect_bp == set([4400000, 1800000, 2200000]) + fg_unique_intersect_bp = set(gather_df["unique_intersect_bp"]) + g_unique_intersect_bp = set(sourmash_gather_df["unique_intersect_bp"]) + assert ( + fg_unique_intersect_bp + == g_unique_intersect_bp + == set([4400000, 1800000, 2200000]) + ) + + fg_gather_result_rank = set(gather_df["gather_result_rank"]) + g_gather_result_rank = set(sourmash_gather_df["gather_result_rank"]) + assert fg_gather_result_rank == g_gather_result_rank == set([0, 1, 2]) - fg_gather_result_rank= set(gather_df['gather_result_rank']) - g_gather_result_rank = set(sourmash_gather_df['gather_result_rank']) - assert fg_gather_result_rank == g_gather_result_rank == set([0,1,2]) - - fg_remaining_bp = list(gather_df['remaining_bp']) + fg_remaining_bp = list(gather_df["remaining_bp"]) assert fg_remaining_bp == [415600000, 413400000, 411600000] ### Gather remaining bp does not match, but I think this one is right? - #g_remaining_bp = list(sourmash_gather_df['remaining_bp']) - #print("gather remaining bp: ", g_remaining_bp) #{4000000, 0, 1800000} + # g_remaining_bp = list(sourmash_gather_df['remaining_bp']) + # print("gather remaining bp: ", g_remaining_bp) #{4000000, 0, 1800000} # assert fg_remaining_bp == g_remaining_bp == set([]) - - fg_query_containment_ani = set([round(x,3) for x in gather_df['query_containment_ani']]) - g_query_containment_ani = set([round(x,3) for x in sourmash_gather_df['query_containment_ani']]) + + fg_query_containment_ani = set( + [round(x, 3) for x in gather_df["query_containment_ani"]] + ) + g_query_containment_ani = set( + [round(x, 3) for x in sourmash_gather_df["query_containment_ani"]] + ) assert fg_query_containment_ani == {0.844, 0.861, 0.863} # gather cANI are nans here -- perhaps b/c sketches too small? # assert fg_query_containment_ani == g_query_containment_ani == set([0.8632, 0.8444, 0.8391]) print("fg qcANI: ", fg_query_containment_ani) print("g_qcANI: ", g_query_containment_ani) - fg_n_unique_weighted_found= set(gather_df['n_unique_weighted_found']) - g_n_unique_weighted_found = set(sourmash_gather_df['n_unique_weighted_found']) - assert fg_n_unique_weighted_found == g_n_unique_weighted_found == set([457, 148, 463]) + fg_n_unique_weighted_found = set(gather_df["n_unique_weighted_found"]) + g_n_unique_weighted_found = set(sourmash_gather_df["n_unique_weighted_found"]) + assert ( + fg_n_unique_weighted_found == g_n_unique_weighted_found == set([457, 148, 463]) + ) - fg_sum_weighted_found= set(gather_df['sum_weighted_found']) - g_sum_weighted_found = set(sourmash_gather_df['sum_weighted_found']) + fg_sum_weighted_found = set(gather_df["sum_weighted_found"]) + g_sum_weighted_found = set(sourmash_gather_df["sum_weighted_found"]) assert fg_sum_weighted_found == g_sum_weighted_found == set([920, 457, 1068]) - - fg_total_weighted_hashes= set(gather_df['total_weighted_hashes']) - g_total_weighted_hashes = set(sourmash_gather_df['total_weighted_hashes']) + + fg_total_weighted_hashes = set(gather_df["total_weighted_hashes"]) + g_total_weighted_hashes = set(sourmash_gather_df["total_weighted_hashes"]) assert fg_total_weighted_hashes == g_total_weighted_hashes == set([73489]) diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index 72d81aac..522cf17e 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -1,6 +1,7 @@ """ Test 'sourmash scripts fastmultigather' """ + import os import pytest import pandas @@ -8,170 +9,253 @@ import sourmash from . import sourmash_tst_utils as utils -from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist, - index_siglist) +from .sourmash_tst_utils import ( + get_test_data, + make_file_list, + zip_siglist, + index_siglist, +) def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather') + runtmp.sourmash("scripts", "fastmultigather") - assert 'usage: fastmultigather' in runtmp.last_result.err + assert "usage: fastmultigather" in runtmp.last_result.err def test_simple(runtmp, zip_against): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) - - - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0', in_directory=runtmp.output('')) - - print(os.listdir(runtmp.output(''))) - - g_output = runtmp.output('SRR606249.gather.csv') - p_output = runtmp.output('SRR606249.prefetch.csv') + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) + + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + in_directory=runtmp.output(""), + ) + + print(os.listdir(runtmp.output(""))) + + g_output = runtmp.output("SRR606249.gather.csv") + p_output = runtmp.output("SRR606249.prefetch.csv") assert os.path.exists(p_output) # check prefetch output (only non-indexed gather) df = pandas.read_csv(p_output) assert len(df) == 3 keys = set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + assert keys == { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + } assert os.path.exists(g_output) df = pandas.read_csv(g_output) print(df) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + "gather_result_rank", + }.issubset(keys) def test_simple_list_of_zips(runtmp): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.sig.zip') - sig47 = get_test_data('47.sig.zip') - sig63 = get_test_data('63.sig.zip') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.sig.zip") + sig47 = get_test_data("47.sig.zip") + sig63 = get_test_data("63.sig.zip") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) cwd = os.getcwd() try: - os.chdir(runtmp.output('')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0') + os.chdir(runtmp.output("")) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + ) finally: os.chdir(cwd) - print(os.listdir(runtmp.output(''))) + print(os.listdir(runtmp.output(""))) - g_output = runtmp.output('SRR606249.gather.csv') - p_output = runtmp.output('SRR606249.prefetch.csv') + g_output = runtmp.output("SRR606249.gather.csv") + p_output = runtmp.output("SRR606249.prefetch.csv") assert os.path.exists(p_output) # check prefetch output (only non-indexed gather) df = pandas.read_csv(p_output) assert len(df) == 3 keys = set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + assert keys == { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + } assert os.path.exists(g_output) df = pandas.read_csv(g_output) print(df) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + "gather_result_rank", + }.issubset(keys) def test_simple_space_in_signame(runtmp): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - renamed_query = runtmp.output('in.zip') - name = 'my-favorite-signame has spaces' + query = get_test_data("SRR606249.sig.gz") + renamed_query = runtmp.output("in.zip") + name = "my-favorite-signame has spaces" # rename signature - runtmp.sourmash('sig', 'rename', query, name, '-o', renamed_query) + runtmp.sourmash("sig", "rename", query, name, "-o", renamed_query) - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") make_file_list(against_list, [sig2, sig47, sig63]) - runtmp.sourmash('scripts', 'fastmultigather', renamed_query, against_list, - '-s', '100000', '-t', '0', in_directory=runtmp.output('')) - - print(os.listdir(runtmp.output(''))) - - g_output = runtmp.output('my-favorite-signame.gather.csv') - p_output = runtmp.output('my-favorite-signame.prefetch.csv') + runtmp.sourmash( + "scripts", + "fastmultigather", + renamed_query, + against_list, + "-s", + "100000", + "-t", + "0", + in_directory=runtmp.output(""), + ) + + print(os.listdir(runtmp.output(""))) + + g_output = runtmp.output("my-favorite-signame.gather.csv") + p_output = runtmp.output("my-favorite-signame.prefetch.csv") assert os.path.exists(p_output) assert os.path.exists(g_output) def test_simple_zip_query(runtmp): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0', in_directory=runtmp.output('') ) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + in_directory=runtmp.output(""), + ) - print(os.listdir(runtmp.output(''))) + print(os.listdir(runtmp.output(""))) - g_output = runtmp.output('SRR606249.gather.csv') - p_output = runtmp.output('SRR606249.prefetch.csv') + g_output = runtmp.output("SRR606249.gather.csv") + p_output = runtmp.output("SRR606249.prefetch.csv") # check prefetch output (only non-indexed gather) assert os.path.exists(p_output) df = pandas.read_csv(p_output) assert len(df) == 3 keys = set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + assert keys == { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + } assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + "gather_result_rank", + }.issubset(keys) def test_simple_read_manifests(runtmp): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") against_mf = runtmp.output("against.csv") query_mf = runtmp.output("query.csv") @@ -180,148 +264,263 @@ def test_simple_read_manifests(runtmp): runtmp.sourmash("sig", "manifest", query, "-o", query_mf) runtmp.sourmash("sig", "manifest", against_list, "-o", against_mf) - runtmp.sourmash('scripts', 'fastmultigather', query_mf, against_list, - '-s', '100000', '-t', '0', in_directory=runtmp.output('')) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_mf, + against_list, + "-s", + "100000", + "-t", + "0", + in_directory=runtmp.output(""), + ) - print(os.listdir(runtmp.output(''))) + print(os.listdir(runtmp.output(""))) - g_output = runtmp.output('SRR606249.gather.csv') - p_output = runtmp.output('SRR606249.prefetch.csv') + g_output = runtmp.output("SRR606249.gather.csv") + p_output = runtmp.output("SRR606249.prefetch.csv") # check prefetch output (only non-indexed gather) assert os.path.exists(p_output) df = pandas.read_csv(p_output) assert len(df) == 3 keys = set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + assert keys == { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + } assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + "gather_result_rank", + }.issubset(keys) def test_simple_indexed(runtmp, zip_query, toggle_internal_storage): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) - - g_output = runtmp.output('out.csv') - against_db = index_siglist(runtmp, against_list, runtmp.output('test.rocksdb'), toggle_internal_storage=toggle_internal_storage) - runtmp.sourmash('scripts', 'fastmultigather', query_list, - against_db, '-s', '100000', '-t', '0', - '-o', g_output) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) + + g_output = runtmp.output("out.csv") + against_db = index_siglist( + runtmp, + against_list, + runtmp.output("test.rocksdb"), + toggle_internal_storage=toggle_internal_storage, + ) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_db, + "-s", + "100000", + "-t", + "0", + "-o", + g_output, + ) assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - expected_keys = {'match_name', 'query_filename', 'query_n_hashes', 'match_filename', 'f_match_orig', - 'query_bp', 'query_abundance', 'match_containment_ani', 'intersect_bp', 'total_weighted_hashes', - 'n_unique_weighted_found', 'query_name', 'gather_result_rank', 'moltype', - 'query_containment_ani', 'sum_weighted_found', 'f_orig_query', 'ksize', 'max_containment_ani', - 'std_abund', 'scaled', 'average_containment_ani', 'f_match', 'f_unique_to_query', - 'average_abund', 'unique_intersect_bp', 'median_abund', 'query_md5', 'match_md5', 'remaining_bp', - 'f_unique_weighted'} - assert keys == expected_keys + expected_keys = { + "match_name", + "query_filename", + "query_n_hashes", + "match_filename", + "f_match_orig", + "query_bp", + "query_abundance", + "match_containment_ani", + "intersect_bp", + "total_weighted_hashes", + "n_unique_weighted_found", + "query_name", + "gather_result_rank", + "moltype", + "query_containment_ani", + "sum_weighted_found", + "f_orig_query", + "ksize", + "max_containment_ani", + "std_abund", + "scaled", + "average_containment_ani", + "f_match", + "f_unique_to_query", + "average_abund", + "unique_intersect_bp", + "median_abund", + "query_md5", + "match_md5", + "remaining_bp", + "f_unique_weighted", + } + assert keys == expected_keys def test_simple_indexed_query_manifest(runtmp, toggle_internal_storage): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_mf = runtmp.output('query.csv') - against_list = runtmp.output('against.txt') + query_mf = runtmp.output("query.csv") + against_list = runtmp.output("against.txt") make_file_list(against_list, [sig2, sig47, sig63]) runtmp.sourmash("sig", "manifest", query, "-o", query_mf) - g_output = runtmp.output('out.csv') - against_db = index_siglist(runtmp, against_list, runtmp.output('db'), - toggle_internal_storage=toggle_internal_storage) - runtmp.sourmash('scripts', 'fastmultigather', query_mf, - against_db, '-s', '100000', '-t', '0', - '-o', g_output) + g_output = runtmp.output("out.csv") + against_db = index_siglist( + runtmp, + against_list, + runtmp.output("db"), + toggle_internal_storage=toggle_internal_storage, + ) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_mf, + against_db, + "-s", + "100000", + "-t", + "0", + "-o", + g_output, + ) assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - expected_keys = {'match_name', 'query_filename', 'query_n_hashes', 'match_filename', 'f_match_orig', - 'query_bp', 'query_abundance', 'match_containment_ani', 'intersect_bp', 'total_weighted_hashes', - 'n_unique_weighted_found', 'query_name', 'gather_result_rank', 'moltype', - 'query_containment_ani', 'sum_weighted_found', 'f_orig_query', 'ksize', 'max_containment_ani', - 'std_abund', 'scaled', 'average_containment_ani', 'f_match', 'f_unique_to_query', - 'average_abund', 'unique_intersect_bp', 'median_abund', 'query_md5', 'match_md5', 'remaining_bp', - 'f_unique_weighted'} - assert keys == expected_keys + expected_keys = { + "match_name", + "query_filename", + "query_n_hashes", + "match_filename", + "f_match_orig", + "query_bp", + "query_abundance", + "match_containment_ani", + "intersect_bp", + "total_weighted_hashes", + "n_unique_weighted_found", + "query_name", + "gather_result_rank", + "moltype", + "query_containment_ani", + "sum_weighted_found", + "f_orig_query", + "ksize", + "max_containment_ani", + "std_abund", + "scaled", + "average_containment_ani", + "f_match", + "f_unique_to_query", + "average_abund", + "unique_intersect_bp", + "median_abund", + "query_md5", + "match_md5", + "remaining_bp", + "f_unique_weighted", + } + assert keys == expected_keys def test_missing_querylist(runtmp, capfd, indexed, zip_query, toggle_internal_storage): # test missing querylist - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") if zip_query: - query_list = runtmp.output('query.zip') + query_list = runtmp.output("query.zip") # do not make query_list! make_file_list(against_list, [sig2, sig47, sig63]) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db'), - toggle_internal_storage=toggle_internal_storage) + against_list = index_siglist( + runtmp, + against_list, + runtmp.output("db"), + toggle_internal_storage=toggle_internal_storage, + ) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000') + runtmp.sourmash( + "scripts", "fastmultigather", query_list, against_list, "-s", "100000" + ) captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory' in captured.err + assert "Error: No such file or directory" in captured.err def test_sig_query(runtmp, capfd, indexed): # sig file is now fine as a query - query = get_test_data('SRR606249.sig.gz') + query = get_test_data("SRR606249.sig.gz") - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) - g_output = runtmp.output('out.csv') - output_params = ['-o', g_output] + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) + g_output = runtmp.output("out.csv") + output_params = ["-o", g_output] else: - g_output = runtmp.output('SRR606249.gather.csv') - p_output = runtmp.output('SRR606249.prefetch.csv') + g_output = runtmp.output("SRR606249.gather.csv") + p_output = runtmp.output("SRR606249.prefetch.csv") output_params = [] - runtmp.sourmash('scripts', 'fastmultigather', query, against_list, - '-s', '100000', *output_params) + runtmp.sourmash( + "scripts", + "fastmultigather", + query, + against_list, + "-s", + "100000", + *output_params, + ) captured = capfd.readouterr() print(captured.err) @@ -331,7 +530,14 @@ def test_sig_query(runtmp, capfd, indexed): df = pandas.read_csv(p_output) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + }.issubset(keys) # check gather output (both) assert os.path.exists(g_output) @@ -339,32 +545,47 @@ def test_sig_query(runtmp, capfd, indexed): assert len(df) == 3 keys = set(df.keys()) if indexed: - assert {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match', 'intersect_bp'}.issubset(keys) + assert { + "query_name", + "query_md5", + "match_name", + "match_md5", + "f_match", + "intersect_bp", + }.issubset(keys) else: - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(keys) def test_bad_query(runtmp, capfd, indexed): # test with a bad query (a .sig.gz file renamed as zip file) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_zip = runtmp.output('query.zip') + query_zip = runtmp.output("query.zip") # cp sig2 into query_zip - with open(query_zip, 'wb') as fp: - with open(sig2, 'rb') as fp2: + with open(query_zip, "wb") as fp: + with open(sig2, "rb") as fp2: fp.write(fp2.read()) make_file_list(against_list, [sig2, sig47, sig63]) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query_zip, against_list) + runtmp.sourmash("scripts", "fastmultigather", query_zip, against_list) captured = capfd.readouterr() print(captured.err) @@ -374,21 +595,22 @@ def test_bad_query(runtmp, capfd, indexed): def test_missing_query(runtmp, capfd, indexed): # test missing query - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - make_file_list(query_list, [sig2, 'no-exist']) + make_file_list(query_list, [sig2, "no-exist"]) make_file_list(against_list, [sig2, sig47, sig63]) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000') + runtmp.sourmash( + "scripts", "fastmultigather", query_list, against_list, "-s", "100000" + ) captured = capfd.readouterr() print(captured.err) @@ -398,24 +620,25 @@ def test_missing_query(runtmp, capfd, indexed): def test_nomatch_query(runtmp, capfd, indexed, zip_query): # test nomatch file in querylist - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - badsig1 = get_test_data('1.fa.k21.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + badsig1 = get_test_data("1.fa.k21.sig.gz") make_file_list(query_list, [sig2, badsig1]) make_file_list(against_list, [sig2, sig47, sig63]) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000') + runtmp.sourmash( + "scripts", "fastmultigather", query_list, against_list, "-s", "100000" + ) captured = capfd.readouterr() print(captured.err) @@ -424,40 +647,40 @@ def test_nomatch_query(runtmp, capfd, indexed, zip_query): def test_missing_against(runtmp, capfd, zip_against): # test missing against - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) if zip_against: - against_list = runtmp.output('against.zip') + against_list = runtmp.output("against.zip") # do not make against_list with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000') + runtmp.sourmash( + "scripts", "fastmultigather", query_list, against_list, "-s", "100000" + ) captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory' in captured.err + assert "Error: No such file or directory" in captured.err def test_sig_against(runtmp, capfd): # against file can be a sig now - query = get_test_data('SRR606249.sig.gz') - against_list = runtmp.output('against.txt') + query = get_test_data("SRR606249.sig.gz") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") - g_output = runtmp.output('SRR606249.gather.csv') - p_output = runtmp.output('SRR606249.prefetch.csv') - runtmp.sourmash('scripts', 'fastmultigather', query, sig2, - '-s', '100000') + g_output = runtmp.output("SRR606249.gather.csv") + p_output = runtmp.output("SRR606249.prefetch.csv") + runtmp.sourmash("scripts", "fastmultigather", query, sig2, "-s", "100000") captured = capfd.readouterr() print(captured.err) @@ -467,75 +690,96 @@ def test_sig_against(runtmp, capfd): df = pandas.read_csv(p_output) assert len(df) == 1 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + }.issubset(keys) # check gather output assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 1 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + "gather_result_rank", + }.issubset(keys) def test_bad_against(runtmp, capfd): # test bad 'against' file - in this case, one containing a nonexistent file - query = get_test_data('SRR606249.sig.gz') - query_list = runtmp.output('query.txt') + query = get_test_data("SRR606249.sig.gz") + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - against_list = runtmp.output('against.txt') - sig2 = get_test_data('2.fa.sig.gz') + against_list = runtmp.output("against.txt") + sig2 = get_test_data("2.fa.sig.gz") make_file_list(against_list, [sig2, "no exist"]) # should succeed, but with error output. - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000') + runtmp.sourmash( + "scripts", "fastmultigather", query_list, against_list, "-s", "100000" + ) captured = capfd.readouterr() print(captured.err) assert "WARNING: could not load sketches from path 'no exist'" in captured.err - assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err + assert ( + "WARNING: 1 search paths failed to load. See error messages above." + in captured.err + ) def test_bad_against_2(runtmp, capfd, zip_query): # test with a bad against (a .sig.gz file renamed as zip file) - query = get_test_data('SRR606249.sig.gz') - query_list = runtmp.output('query.txt') + query = get_test_data("SRR606249.sig.gz") + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - sig2 = get_test_data('2.fa.sig.gz') - against_zip = runtmp.output('against.zip') + sig2 = get_test_data("2.fa.sig.gz") + against_zip = runtmp.output("against.zip") # cp sig2 into query_zip - with open(against_zip, 'wb') as fp: - with open(sig2, 'rb') as fp2: + with open(against_zip, "wb") as fp: + with open(sig2, "rb") as fp2: fp.write(fp2.read()) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_zip, - '-s', '100000') + runtmp.sourmash( + "scripts", "fastmultigather", query_list, against_zip, "-s", "100000" + ) captured = capfd.readouterr() print(captured.err) - assert 'InvalidArchive' in captured.err + assert "InvalidArchive" in captured.err def test_empty_against(runtmp, capfd): # test bad 'against' file - in this case, an empty one - query = get_test_data('SRR606249.sig.gz') - query_list = runtmp.output('query.txt') + query = get_test_data("SRR606249.sig.gz") + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") make_file_list(against_list, []) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000') + runtmp.sourmash( + "scripts", "fastmultigather", query_list, against_list, "-s", "100000" + ) captured = capfd.readouterr() print(captured.err) @@ -546,60 +790,77 @@ def test_empty_against(runtmp, capfd): def test_nomatch_in_against(runtmp, capfd, zip_against): # test an against file that has a non-matching ksize sig in it - query = get_test_data('SRR606249.sig.gz') - query_list = runtmp.output('query.txt') + query = get_test_data("SRR606249.sig.gz") + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig1 = get_test_data('1.fa.k21.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig1 = get_test_data("1.fa.k21.sig.gz") make_file_list(against_list, [sig2, sig1]) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000') + runtmp.sourmash( + "scripts", "fastmultigather", query_list, against_list, "-s", "100000" + ) captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 search paths - no compatible signatures.' in captured.err + assert "WARNING: skipped 1 search paths - no compatible signatures." in captured.err def test_md5(runtmp, zip_query): # test correct md5s present in output - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0', in_directory=runtmp.output('')) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + in_directory=runtmp.output(""), + ) - print(os.listdir(runtmp.output(''))) + print(os.listdir(runtmp.output(""))) - g_output = runtmp.output('SRR606249.gather.csv') - p_output = runtmp.output('SRR606249.prefetch.csv') + g_output = runtmp.output("SRR606249.gather.csv") + p_output = runtmp.output("SRR606249.prefetch.csv") # check prefetch output (only non-indexed gather) assert os.path.exists(p_output) df = pandas.read_csv(p_output) assert len(df) == 3 keys = set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} - - md5s = set(df['match_md5']) + assert keys == { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + } + + md5s = set(df["match_md5"]) for against_file in (sig2, sig47, sig63): for ss in sourmash.load_file_as_signatures(against_file, ksize=31): assert ss.md5sum() in md5s @@ -609,9 +870,16 @@ def test_md5(runtmp, zip_query): df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}.issubset(keys) - - md5s = set(df['match_md5']) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + }.issubset(keys) + + md5s = set(df["match_md5"]) for against_file in (sig2, sig47, sig63): for ss in sourmash.load_file_as_signatures(against_file, ksize=31): assert ss.md5sum() in md5s @@ -619,41 +887,76 @@ def test_md5(runtmp, zip_query): def test_md5_indexed(runtmp, zip_query): # test correct md5s present in output - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) - - g_output = runtmp.output('out.csv') - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, - against_list, '-s', '100000', '-t', '0', - '-o', g_output) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) + + g_output = runtmp.output("out.csv") + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + "-o", + g_output, + ) # check gather output (mostly same for indexed vs non-indexed version) assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - expected_keys = {'match_name', 'query_filename', 'query_n_hashes', 'match_filename', 'f_match_orig', - 'query_bp', 'query_abundance', 'match_containment_ani', 'intersect_bp', 'total_weighted_hashes', - 'n_unique_weighted_found', 'query_name', 'gather_result_rank', 'moltype', - 'query_containment_ani', 'sum_weighted_found', 'f_orig_query', 'ksize', 'max_containment_ani', - 'std_abund', 'scaled', 'average_containment_ani', 'f_match', 'f_unique_to_query', - 'average_abund', 'unique_intersect_bp', 'median_abund', 'query_md5', 'match_md5', 'remaining_bp', - 'f_unique_weighted'} + expected_keys = { + "match_name", + "query_filename", + "query_n_hashes", + "match_filename", + "f_match_orig", + "query_bp", + "query_abundance", + "match_containment_ani", + "intersect_bp", + "total_weighted_hashes", + "n_unique_weighted_found", + "query_name", + "gather_result_rank", + "moltype", + "query_containment_ani", + "sum_weighted_found", + "f_orig_query", + "ksize", + "max_containment_ani", + "std_abund", + "scaled", + "average_containment_ani", + "f_match", + "f_unique_to_query", + "average_abund", + "unique_intersect_bp", + "median_abund", + "query_md5", + "match_md5", + "remaining_bp", + "f_unique_weighted", + } assert keys == expected_keys - md5s = set(df['match_md5']) + md5s = set(df["match_md5"]) for against_file in (sig2, sig47, sig63): for ss in sourmash.load_file_as_signatures(against_file, ksize=31): assert ss.md5sum() in md5s @@ -661,147 +964,266 @@ def test_md5_indexed(runtmp, zip_query): def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_query, zip_against): # the column names should be strict subsets of sourmash prefetch cols - query = get_test_data('SRR606249.sig.gz') + query = get_test_data("SRR606249.sig.gz") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") make_file_list(against_list, [sig2, sig47, sig63]) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) - - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0', in_directory=runtmp.output('')) - - g_output = runtmp.output('SRR606249.gather.csv') - p_output = runtmp.output('SRR606249.prefetch.csv') + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) + + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + in_directory=runtmp.output(""), + ) + + g_output = runtmp.output("SRR606249.gather.csv") + p_output = runtmp.output("SRR606249.prefetch.csv") assert os.path.exists(p_output) assert os.path.exists(g_output) # now run sourmash prefetch - sp_output = runtmp.output('sourmash-prefetch.csv') - runtmp.sourmash('prefetch', query, against_list, - '-o', sp_output, '--scaled', '100000') + sp_output = runtmp.output("sourmash-prefetch.csv") + runtmp.sourmash( + "prefetch", query, against_list, "-o", sp_output, "--scaled", "100000" + ) gather_df = pandas.read_csv(g_output) g_keys = set(gather_df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(g_keys) - g_keys.remove('gather_result_rank') # 'rank' is not in sourmash prefetch! + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "gather_result_rank", + "intersect_bp", + }.issubset(g_keys) + g_keys.remove("gather_result_rank") # 'rank' is not in sourmash prefetch! sourmash_prefetch_df = pandas.read_csv(sp_output) sp_keys = set(sourmash_prefetch_df.keys()) print(g_keys - sp_keys) diff_keys = g_keys - sp_keys - assert diff_keys == set(['remaining_bp', 'f_match_orig', 'f_unique_weighted', 'average_abund', 'unique_intersect_bp', 'std_abund', 'sum_weighted_found', 'median_abund', 'n_unique_weighted_found', 'f_unique_to_query', 'f_orig_query', 'total_weighted_hashes', 'f_match']) + assert diff_keys == set( + [ + "remaining_bp", + "f_match_orig", + "f_unique_weighted", + "average_abund", + "unique_intersect_bp", + "std_abund", + "sum_weighted_found", + "median_abund", + "n_unique_weighted_found", + "f_unique_to_query", + "f_orig_query", + "total_weighted_hashes", + "f_match", + ] + ) + def test_csv_columns_vs_sourmash_gather_fullresults(runtmp): # the column names should be identical to sourmash gather cols - query = get_test_data('SRR606249.sig.gz') + query = get_test_data("SRR606249.sig.gz") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") make_file_list(against_list, [sig2, sig47, sig63]) - g_output = runtmp.output('SRR606249.gather.csv') - runtmp.sourmash('scripts', 'fastmultigather', query_list, - against_list, '-s', '100000', '-t', '0', - ) # '-o', g_output, + g_output = runtmp.output("SRR606249.gather.csv") + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + ) # '-o', g_output, assert os.path.exists(g_output) # now run sourmash gather - sg_output = runtmp.output('.csv') - runtmp.sourmash('gather', query, against_list, - '-o', sg_output, '--scaled', '100000') + sg_output = runtmp.output(".csv") + runtmp.sourmash( + "gather", query, against_list, "-o", sg_output, "--scaled", "100000" + ) gather_df = pandas.read_csv(g_output) g_keys = set(gather_df.keys()) - expected_keys = {'match_name', 'query_filename', 'query_n_hashes', 'match_filename', 'f_match_orig', - 'query_bp', 'query_abundance', 'match_containment_ani', 'intersect_bp', 'total_weighted_hashes', - 'n_unique_weighted_found', 'query_name', 'gather_result_rank', 'moltype', - 'query_containment_ani', 'sum_weighted_found', 'f_orig_query', 'ksize', 'max_containment_ani', - 'std_abund', 'scaled', 'average_containment_ani', 'f_match', 'f_unique_to_query', - 'average_abund', 'unique_intersect_bp', 'median_abund', 'query_md5', 'match_md5', 'remaining_bp', - 'f_unique_weighted'} + expected_keys = { + "match_name", + "query_filename", + "query_n_hashes", + "match_filename", + "f_match_orig", + "query_bp", + "query_abundance", + "match_containment_ani", + "intersect_bp", + "total_weighted_hashes", + "n_unique_weighted_found", + "query_name", + "gather_result_rank", + "moltype", + "query_containment_ani", + "sum_weighted_found", + "f_orig_query", + "ksize", + "max_containment_ani", + "std_abund", + "scaled", + "average_containment_ani", + "f_match", + "f_unique_to_query", + "average_abund", + "unique_intersect_bp", + "median_abund", + "query_md5", + "match_md5", + "remaining_bp", + "f_unique_weighted", + } assert g_keys == expected_keys sourmash_gather_df = pandas.read_csv(sg_output) sg_keys = set(sourmash_gather_df.keys()) print(sg_keys) modified_keys = ["match_md5", "match_name", "match_filename"] - sg_keys.update(modified_keys) # fastmultigather is more explicit (match_md5 instead of md5, etc) - print('g_keys - sg_keys:', g_keys - sg_keys) + sg_keys.update( + modified_keys + ) # fastmultigather is more explicit (match_md5 instead of md5, etc) + print("g_keys - sg_keys:", g_keys - sg_keys) assert not g_keys - sg_keys, g_keys - sg_keys def test_csv_columns_vs_sourmash_gather_indexed(runtmp): # the column names should be identical to sourmash gather cols - query = get_test_data('SRR606249.sig.gz') + query = get_test_data("SRR606249.sig.gz") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") make_file_list(against_list, [sig2, sig47, sig63]) - g_output = runtmp.output('out.csv') - against_db = index_siglist(runtmp, against_list, runtmp.output('db')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, - against_db, '-s', '100000', '-t', '0', - '-o', g_output) + g_output = runtmp.output("out.csv") + against_db = index_siglist(runtmp, against_list, runtmp.output("db")) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_db, + "-s", + "100000", + "-t", + "0", + "-o", + g_output, + ) assert os.path.exists(g_output) # now run sourmash gather - sg_output = runtmp.output('sourmash-gather.csv') - runtmp.sourmash('gather', query, against_list, - '-o', sg_output, '--scaled', '100000') + sg_output = runtmp.output("sourmash-gather.csv") + runtmp.sourmash( + "gather", query, against_list, "-o", sg_output, "--scaled", "100000" + ) gather_df = pandas.read_csv(g_output) g_keys = set(gather_df.keys()) - expected_keys = {'match_name', 'query_filename', 'query_n_hashes', 'match_filename', 'f_match_orig', - 'query_bp', 'query_abundance', 'match_containment_ani', 'intersect_bp', 'total_weighted_hashes', - 'n_unique_weighted_found', 'query_name', 'gather_result_rank', 'moltype', - 'query_containment_ani', 'sum_weighted_found', 'f_orig_query', 'ksize', 'max_containment_ani', - 'std_abund', 'scaled', 'average_containment_ani', 'f_match', 'f_unique_to_query', - 'average_abund', 'unique_intersect_bp', 'median_abund', 'query_md5', 'match_md5', 'remaining_bp', - 'f_unique_weighted'} + expected_keys = { + "match_name", + "query_filename", + "query_n_hashes", + "match_filename", + "f_match_orig", + "query_bp", + "query_abundance", + "match_containment_ani", + "intersect_bp", + "total_weighted_hashes", + "n_unique_weighted_found", + "query_name", + "gather_result_rank", + "moltype", + "query_containment_ani", + "sum_weighted_found", + "f_orig_query", + "ksize", + "max_containment_ani", + "std_abund", + "scaled", + "average_containment_ani", + "f_match", + "f_unique_to_query", + "average_abund", + "unique_intersect_bp", + "median_abund", + "query_md5", + "match_md5", + "remaining_bp", + "f_unique_weighted", + } assert g_keys == expected_keys sourmash_gather_df = pandas.read_csv(sg_output) sg_keys = set(sourmash_gather_df.keys()) print(sg_keys) modified_keys = ["match_md5", "match_name", "match_filename"] - sg_keys.update(modified_keys) # fastmultigather is more explicit (match_md5 instead of md5, etc) - print('g_keys - sg_keys:', g_keys - sg_keys) + sg_keys.update( + modified_keys + ) # fastmultigather is more explicit (match_md5 instead of md5, etc) + print("g_keys - sg_keys:", g_keys - sg_keys) assert not g_keys - sg_keys, g_keys - sg_keys def test_simple_protein(runtmp): # test basic protein execution - sigs = get_test_data('protein.zip') + sigs = get_test_data("protein.zip") sig_names = ["GCA_001593935", "GCA_001593925"] - runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, - '-s', '100', '--moltype', 'protein', '-k', '19') + runtmp.sourmash( + "scripts", + "fastmultigather", + sigs, + sigs, + "-s", + "100", + "--moltype", + "protein", + "-k", + "19", + ) for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + ".gather.csv")) + p_output = runtmp.output(os.path.join(qsig + ".prefetch.csv")) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) @@ -809,24 +1231,42 @@ def test_simple_protein(runtmp): df = pandas.read_csv(g_output) assert len(df) == 1 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + "gather_result_rank", + }.issubset(keys) print(df) # since we're just matching to identical sigs, the md5s should be the same - assert df['query_md5'][0] == df['match_md5'][0] + assert df["query_md5"][0] == df["match_md5"][0] def test_simple_dayhoff(runtmp): # test basic protein execution - sigs = get_test_data('dayhoff.zip') + sigs = get_test_data("dayhoff.zip") sig_names = ["GCA_001593935", "GCA_001593925"] - runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, - '-s', '100', '--moltype', 'dayhoff', '-k', '19') + runtmp.sourmash( + "scripts", + "fastmultigather", + sigs, + sigs, + "-s", + "100", + "--moltype", + "dayhoff", + "-k", + "19", + ) for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + ".gather.csv")) + p_output = runtmp.output(os.path.join(qsig + ".prefetch.csv")) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) @@ -834,24 +1274,42 @@ def test_simple_dayhoff(runtmp): df = pandas.read_csv(g_output) assert len(df) == 1 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + "gather_result_rank", + }.issubset(keys) print(df) # since we're just matching to identical sigs, the md5s should be the same - assert df['query_md5'][0] == df['match_md5'][0] + assert df["query_md5"][0] == df["match_md5"][0] def test_simple_hp(runtmp): # test basic protein execution - sigs = get_test_data('hp.zip') + sigs = get_test_data("hp.zip") sig_names = ["GCA_001593935", "GCA_001593925"] - runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, - '-s', '100', '--moltype', 'hp', '-k', '19') + runtmp.sourmash( + "scripts", + "fastmultigather", + sigs, + sigs, + "-s", + "100", + "--moltype", + "hp", + "-k", + "19", + ) for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + ".gather.csv")) + p_output = runtmp.output(os.path.join(qsig + ".prefetch.csv")) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) @@ -859,173 +1317,342 @@ def test_simple_hp(runtmp): df = pandas.read_csv(g_output) assert len(df) == 1 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + "gather_result_rank", + }.issubset(keys) print(df) # since we're just matching to identical sigs, the md5s should be the same - assert df['query_md5'][0] == df['match_md5'][0] + assert df["query_md5"][0] == df["match_md5"][0] def test_simple_protein_indexed(runtmp): # test basic protein execution - sigs = get_test_data('protein.zip') - - sigs_db = index_siglist(runtmp, sigs, runtmp.output('db'), ksize=19, moltype='protein', scaled=100) - - out_csv = runtmp.output('out.csv') - runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs_db, - '-s', '100', '--moltype', 'protein', '-k', '19', - '-o', out_csv) + sigs = get_test_data("protein.zip") + + sigs_db = index_siglist( + runtmp, sigs, runtmp.output("db"), ksize=19, moltype="protein", scaled=100 + ) + + out_csv = runtmp.output("out.csv") + runtmp.sourmash( + "scripts", + "fastmultigather", + sigs, + sigs_db, + "-s", + "100", + "--moltype", + "protein", + "-k", + "19", + "-o", + out_csv, + ) assert os.path.exists(out_csv) df = pandas.read_csv(out_csv) assert len(df) == 2 keys = set(df.keys()) - expected_keys = {'match_name', 'query_filename', 'query_n_hashes', 'match_filename', 'f_match_orig', - 'query_bp', 'query_abundance', 'match_containment_ani', 'intersect_bp', 'total_weighted_hashes', - 'n_unique_weighted_found', 'query_name', 'gather_result_rank', 'moltype', - 'query_containment_ani', 'sum_weighted_found', 'f_orig_query', 'ksize', 'max_containment_ani', - 'std_abund', 'scaled', 'average_containment_ani', 'f_match', 'f_unique_to_query', - 'average_abund', 'unique_intersect_bp', 'median_abund', 'query_md5', 'match_md5', 'remaining_bp', - 'f_unique_weighted'} + expected_keys = { + "match_name", + "query_filename", + "query_n_hashes", + "match_filename", + "f_match_orig", + "query_bp", + "query_abundance", + "match_containment_ani", + "intersect_bp", + "total_weighted_hashes", + "n_unique_weighted_found", + "query_name", + "gather_result_rank", + "moltype", + "query_containment_ani", + "sum_weighted_found", + "f_orig_query", + "ksize", + "max_containment_ani", + "std_abund", + "scaled", + "average_containment_ani", + "f_match", + "f_unique_to_query", + "average_abund", + "unique_intersect_bp", + "median_abund", + "query_md5", + "match_md5", + "remaining_bp", + "f_unique_weighted", + } assert keys == expected_keys print(df) # since we're just matching to identical sigs, the md5s should be the same - assert df['query_md5'][0] == df['match_md5'][0] - assert df['query_md5'][1] == df['match_md5'][1] + assert df["query_md5"][0] == df["match_md5"][0] + assert df["query_md5"][1] == df["match_md5"][1] def test_simple_dayhoff_indexed(runtmp): # test basic protein execution - sigs = get_test_data('dayhoff.zip') - - sigs_db = index_siglist(runtmp, sigs, runtmp.output('db'), ksize=19, moltype='dayhoff', scaled=100) - - out_csv = runtmp.output('out.csv') - runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs_db, - '-s', '100', '--moltype', 'dayhoff', '-k', '19', - '-o', out_csv) + sigs = get_test_data("dayhoff.zip") + + sigs_db = index_siglist( + runtmp, sigs, runtmp.output("db"), ksize=19, moltype="dayhoff", scaled=100 + ) + + out_csv = runtmp.output("out.csv") + runtmp.sourmash( + "scripts", + "fastmultigather", + sigs, + sigs_db, + "-s", + "100", + "--moltype", + "dayhoff", + "-k", + "19", + "-o", + out_csv, + ) assert os.path.exists(out_csv) df = pandas.read_csv(out_csv) assert len(df) == 2 keys = set(df.keys()) - expected_keys = {'match_name', 'query_filename', 'query_n_hashes', 'match_filename', 'f_match_orig', - 'query_bp', 'query_abundance', 'match_containment_ani', 'intersect_bp', 'total_weighted_hashes', - 'n_unique_weighted_found', 'query_name', 'gather_result_rank', 'moltype', - 'query_containment_ani', 'sum_weighted_found', 'f_orig_query', 'ksize', 'max_containment_ani', - 'std_abund', 'scaled', 'average_containment_ani', 'f_match', 'f_unique_to_query', - 'average_abund', 'unique_intersect_bp', 'median_abund', 'query_md5', 'match_md5', 'remaining_bp', - 'f_unique_weighted'} + expected_keys = { + "match_name", + "query_filename", + "query_n_hashes", + "match_filename", + "f_match_orig", + "query_bp", + "query_abundance", + "match_containment_ani", + "intersect_bp", + "total_weighted_hashes", + "n_unique_weighted_found", + "query_name", + "gather_result_rank", + "moltype", + "query_containment_ani", + "sum_weighted_found", + "f_orig_query", + "ksize", + "max_containment_ani", + "std_abund", + "scaled", + "average_containment_ani", + "f_match", + "f_unique_to_query", + "average_abund", + "unique_intersect_bp", + "median_abund", + "query_md5", + "match_md5", + "remaining_bp", + "f_unique_weighted", + } assert keys == expected_keys print(df) # since we're just matching to identical sigs, the md5s should be the same - assert df['query_md5'][0] == df['match_md5'][0] - assert df['query_md5'][1] == df['match_md5'][1] + assert df["query_md5"][0] == df["match_md5"][0] + assert df["query_md5"][1] == df["match_md5"][1] def test_simple_hp_indexed(runtmp): # test basic protein execution - sigs = get_test_data('hp.zip') - - sigs_db = index_siglist(runtmp, sigs, runtmp.output('db'), ksize=19, moltype='hp', scaled=100) - - out_csv = runtmp.output('out.csv') - runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs_db, - '-s', '100', '--moltype', 'hp', '-k', '19', - '-o', out_csv) + sigs = get_test_data("hp.zip") + + sigs_db = index_siglist( + runtmp, sigs, runtmp.output("db"), ksize=19, moltype="hp", scaled=100 + ) + + out_csv = runtmp.output("out.csv") + runtmp.sourmash( + "scripts", + "fastmultigather", + sigs, + sigs_db, + "-s", + "100", + "--moltype", + "hp", + "-k", + "19", + "-o", + out_csv, + ) assert os.path.exists(out_csv) df = pandas.read_csv(out_csv) assert len(df) == 2 keys = set(df.keys()) - expected_keys = {'match_name', 'query_filename', 'query_n_hashes', 'match_filename', 'f_match_orig', - 'query_bp', 'query_abundance', 'match_containment_ani', 'intersect_bp', 'total_weighted_hashes', - 'n_unique_weighted_found', 'query_name', 'gather_result_rank', 'moltype', - 'query_containment_ani', 'sum_weighted_found', 'f_orig_query', 'ksize', 'max_containment_ani', - 'std_abund', 'scaled', 'average_containment_ani', 'f_match', 'f_unique_to_query', - 'average_abund', 'unique_intersect_bp', 'median_abund', 'query_md5', 'match_md5', 'remaining_bp', - 'f_unique_weighted'} + expected_keys = { + "match_name", + "query_filename", + "query_n_hashes", + "match_filename", + "f_match_orig", + "query_bp", + "query_abundance", + "match_containment_ani", + "intersect_bp", + "total_weighted_hashes", + "n_unique_weighted_found", + "query_name", + "gather_result_rank", + "moltype", + "query_containment_ani", + "sum_weighted_found", + "f_orig_query", + "ksize", + "max_containment_ani", + "std_abund", + "scaled", + "average_containment_ani", + "f_match", + "f_unique_to_query", + "average_abund", + "unique_intersect_bp", + "median_abund", + "query_md5", + "match_md5", + "remaining_bp", + "f_unique_weighted", + } assert keys == expected_keys print(df) # since we're just matching to identical sigs, the md5s should be the same - assert df['query_md5'][0] == df['match_md5'][0] - assert df['query_md5'][1] == df['match_md5'][1] + assert df["query_md5"][0] == df["match_md5"][0] + assert df["query_md5"][1] == df["match_md5"][1] def test_indexed_full_output(runtmp): # test correct md5s present in output - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) - g_output = runtmp.output('out.csv') - against_db = index_siglist(runtmp, against_list, runtmp.output('rocksdb')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, - against_db, '-s', '100000', '-t', '0', - '-o', g_output) + g_output = runtmp.output("out.csv") + against_db = index_siglist(runtmp, against_list, runtmp.output("rocksdb")) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_db, + "-s", + "100000", + "-t", + "0", + "-o", + g_output, + ) # check full gather output assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - expected_keys = {'match_name', 'query_filename', 'query_n_hashes', 'match_filename', 'f_match_orig', - 'query_bp', 'query_abundance', 'match_containment_ani', 'intersect_bp', 'total_weighted_hashes', - 'n_unique_weighted_found', 'query_name', 'gather_result_rank', 'moltype', - 'query_containment_ani', 'sum_weighted_found', 'f_orig_query', 'ksize', 'max_containment_ani', - 'std_abund', 'scaled', 'average_containment_ani', 'f_match', 'f_unique_to_query', - 'average_abund', 'unique_intersect_bp', 'median_abund', 'query_md5', 'match_md5', 'remaining_bp', - 'f_unique_weighted'} + expected_keys = { + "match_name", + "query_filename", + "query_n_hashes", + "match_filename", + "f_match_orig", + "query_bp", + "query_abundance", + "match_containment_ani", + "intersect_bp", + "total_weighted_hashes", + "n_unique_weighted_found", + "query_name", + "gather_result_rank", + "moltype", + "query_containment_ani", + "sum_weighted_found", + "f_orig_query", + "ksize", + "max_containment_ani", + "std_abund", + "scaled", + "average_containment_ani", + "f_match", + "f_unique_to_query", + "average_abund", + "unique_intersect_bp", + "median_abund", + "query_md5", + "match_md5", + "remaining_bp", + "f_unique_weighted", + } assert keys == expected_keys results = df.values.tolist() # check a few columns - average_ani = set(df['average_containment_ani']) + average_ani = set(df["average_containment_ani"]) avg_ani = set([round(x, 4) for x in average_ani]) - assert avg_ani == {0.9221, 0.9306, 0.9316} # @CTB check against py gather + assert avg_ani == {0.9221, 0.9306, 0.9316} # @CTB check against py gather - f_unique_weighted = set(df['f_unique_weighted']) + f_unique_weighted = set(df["f_unique_weighted"]) f_unique_weighted = set([round(x, 4) for x in f_unique_weighted]) assert f_unique_weighted == {0.0063, 0.002, 0.0062} - unique_intersect_bp = set(df['unique_intersect_bp']) - unique_intersect_bp = set([round(x,4) for x in unique_intersect_bp]) + unique_intersect_bp = set(df["unique_intersect_bp"]) + unique_intersect_bp = set([round(x, 4) for x in unique_intersect_bp]) assert unique_intersect_bp == {4400000, 1800000, 2200000} def test_nonindexed_full_vs_sourmash_gather(runtmp): - query = get_test_data('SRR606249.sig.gz') + query = get_test_data("SRR606249.sig.gz") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") make_file_list(against_list, [sig2, sig47, sig63]) - g_output = runtmp.output('SRR606249.gather.csv') - runtmp.sourmash('scripts', 'fastmultigather', query_list, - against_list, '-s', '100000', '-t', '0') + g_output = runtmp.output("SRR606249.gather.csv") + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) assert os.path.exists(g_output) # now run sourmash gather - sg_output = runtmp.output('.csv') - runtmp.sourmash('gather', query, against_list, - '-o', sg_output, '--scaled', '100000') + sg_output = runtmp.output(".csv") + runtmp.sourmash( + "gather", query, against_list, "-o", sg_output, "--scaled", "100000" + ) gather_df = pandas.read_csv(g_output) g_keys = set(gather_df.keys()) @@ -1034,162 +1661,208 @@ def test_nonindexed_full_vs_sourmash_gather(runtmp): sg_keys = set(sourmash_gather_df.keys()) print(sg_keys) modified_keys = ["match_md5", "match_name", "match_filename"] - sg_keys.update(modified_keys) # fastmultigather is more explicit (match_md5 instead of md5, etc) - print('g_keys - sg_keys:', g_keys - sg_keys) + sg_keys.update( + modified_keys + ) # fastmultigather is more explicit (match_md5 instead of md5, etc) + print("g_keys - sg_keys:", g_keys - sg_keys) assert not g_keys - sg_keys, g_keys - sg_keys for index, row in sourmash_gather_df.iterrows(): print(row.to_dict()) - fmg_intersect_bp = set(gather_df['intersect_bp']) - g_intersect_bp = set(sourmash_gather_df['intersect_bp']) + fmg_intersect_bp = set(gather_df["intersect_bp"]) + g_intersect_bp = set(sourmash_gather_df["intersect_bp"]) assert fmg_intersect_bp == g_intersect_bp == set([4400000, 4100000, 2200000]) - fmg_f_orig_query = set([round(x,4) for x in gather_df['f_orig_query']]) - g_f_orig_query = set([round(x,4) for x in sourmash_gather_df['f_orig_query']]) + fmg_f_orig_query = set([round(x, 4) for x in gather_df["f_orig_query"]]) + g_f_orig_query = set([round(x, 4) for x in sourmash_gather_df["f_orig_query"]]) assert fmg_f_orig_query == g_f_orig_query == set([0.0098, 0.0105, 0.0052]) - fmg_f_match = set([round(x,4) for x in gather_df['f_match']]) - g_f_match = set([round(x,4) for x in sourmash_gather_df['f_match']]) + fmg_f_match = set([round(x, 4) for x in gather_df["f_match"]]) + g_f_match = set([round(x, 4) for x in sourmash_gather_df["f_match"]]) assert fmg_f_match == g_f_match == set([0.439, 1.0]) - fmg_f_unique_to_query = set([round(x,3) for x in gather_df['f_unique_to_query']]) # rounding to 4 --> slightly different! - g_f_unique_to_query = set([round(x,3) for x in sourmash_gather_df['f_unique_to_query']]) + fmg_f_unique_to_query = set( + [round(x, 3) for x in gather_df["f_unique_to_query"]] + ) # rounding to 4 --> slightly different! + g_f_unique_to_query = set( + [round(x, 3) for x in sourmash_gather_df["f_unique_to_query"]] + ) assert fmg_f_unique_to_query == g_f_unique_to_query == set([0.004, 0.01, 0.005]) - fmg_f_unique_weighted = set([round(x,4) for x in gather_df['f_unique_weighted']]) - g_f_unique_weighted = set([round(x,4) for x in sourmash_gather_df['f_unique_weighted']]) - assert fmg_f_unique_weighted== g_f_unique_weighted == set([0.0063, 0.002, 0.0062]) - - fmg_average_abund = set([round(x,4) for x in gather_df['average_abund']]) - g_average_abund = set([round(x,4) for x in sourmash_gather_df['average_abund']]) - assert fmg_average_abund== g_average_abund == set([8.2222, 10.3864, 21.0455]) - - fmg_median_abund = set([round(x,4) for x in gather_df['median_abund']]) - g_median_abund = set([round(x,4) for x in sourmash_gather_df['median_abund']]) - assert fmg_median_abund== g_median_abund == set([8.0, 10.5, 21.5]) - - fmg_std_abund = set([round(x,4) for x in gather_df['std_abund']]) - g_std_abund = set([round(x,4) for x in sourmash_gather_df['std_abund']]) - assert fmg_std_abund== g_std_abund == set([3.172, 5.6446, 6.9322]) - - g_match_filename_basename = [os.path.basename(filename) for filename in sourmash_gather_df['filename']] - fmg_match_filename_basename = [os.path.basename(filename) for filename in gather_df['match_filename']] - assert all([x in fmg_match_filename_basename for x in ['2.fa.sig.gz', '63.fa.sig.gz', '47.fa.sig.gz']]) + fmg_f_unique_weighted = set([round(x, 4) for x in gather_df["f_unique_weighted"]]) + g_f_unique_weighted = set( + [round(x, 4) for x in sourmash_gather_df["f_unique_weighted"]] + ) + assert fmg_f_unique_weighted == g_f_unique_weighted == set([0.0063, 0.002, 0.0062]) + + fmg_average_abund = set([round(x, 4) for x in gather_df["average_abund"]]) + g_average_abund = set([round(x, 4) for x in sourmash_gather_df["average_abund"]]) + assert fmg_average_abund == g_average_abund == set([8.2222, 10.3864, 21.0455]) + + fmg_median_abund = set([round(x, 4) for x in gather_df["median_abund"]]) + g_median_abund = set([round(x, 4) for x in sourmash_gather_df["median_abund"]]) + assert fmg_median_abund == g_median_abund == set([8.0, 10.5, 21.5]) + + fmg_std_abund = set([round(x, 4) for x in gather_df["std_abund"]]) + g_std_abund = set([round(x, 4) for x in sourmash_gather_df["std_abund"]]) + assert fmg_std_abund == g_std_abund == set([3.172, 5.6446, 6.9322]) + + g_match_filename_basename = [ + os.path.basename(filename) for filename in sourmash_gather_df["filename"] + ] + fmg_match_filename_basename = [ + os.path.basename(filename) for filename in gather_df["match_filename"] + ] + assert all( + [ + x in fmg_match_filename_basename + for x in ["2.fa.sig.gz", "63.fa.sig.gz", "47.fa.sig.gz"] + ] + ) assert fmg_match_filename_basename == g_match_filename_basename - assert list(sourmash_gather_df['name']) == list(gather_df['match_name']) - assert list(sourmash_gather_df['md5']) == list(gather_df['match_md5']) + assert list(sourmash_gather_df["name"]) == list(gather_df["match_name"]) + assert list(sourmash_gather_df["md5"]) == list(gather_df["match_md5"]) - fmg_f_match_orig = set([round(x,4) for x in gather_df['f_match_orig']]) - g_f_match_orig = set([round(x,4) for x in sourmash_gather_df['f_match_orig']]) + fmg_f_match_orig = set([round(x, 4) for x in gather_df["f_match_orig"]]) + g_f_match_orig = set([round(x, 4) for x in sourmash_gather_df["f_match_orig"]]) assert fmg_f_match_orig == g_f_match_orig == set([1.0]) - fmg_unique_intersect_bp = set(gather_df['unique_intersect_bp']) - g_unique_intersect_bp = set(sourmash_gather_df['unique_intersect_bp']) - assert fmg_unique_intersect_bp == g_unique_intersect_bp == set([4400000, 1800000, 2200000]) + fmg_unique_intersect_bp = set(gather_df["unique_intersect_bp"]) + g_unique_intersect_bp = set(sourmash_gather_df["unique_intersect_bp"]) + assert ( + fmg_unique_intersect_bp + == g_unique_intersect_bp + == set([4400000, 1800000, 2200000]) + ) - fmg_gather_result_rank= set(gather_df['gather_result_rank']) - g_gather_result_rank = set(sourmash_gather_df['gather_result_rank']) - assert fmg_gather_result_rank == g_gather_result_rank == set([0,1,2]) + fmg_gather_result_rank = set(gather_df["gather_result_rank"]) + g_gather_result_rank = set(sourmash_gather_df["gather_result_rank"]) + assert fmg_gather_result_rank == g_gather_result_rank == set([0, 1, 2]) - fmg_remaining_bp = list(gather_df['remaining_bp']) + fmg_remaining_bp = list(gather_df["remaining_bp"]) assert fmg_remaining_bp == [415600000, 413400000, 411600000] ### Gather remaining bp does not match, but I think this one is right? - #g_remaining_bp = list(sourmash_gather_df['remaining_bp']) - #print("gather remaining bp: ", g_remaining_bp) #{4000000, 0, 1800000} + # g_remaining_bp = list(sourmash_gather_df['remaining_bp']) + # print("gather remaining bp: ", g_remaining_bp) #{4000000, 0, 1800000} # assert fmg_remaining_bp == g_remaining_bp == set([]) - fmg_query_containment_ani = set([round(x,4) for x in gather_df['query_containment_ani']]) - g_query_containment_ani = set([round(x,4) for x in sourmash_gather_df['query_containment_ani']]) + fmg_query_containment_ani = set( + [round(x, 4) for x in gather_df["query_containment_ani"]] + ) + g_query_containment_ani = set( + [round(x, 4) for x in sourmash_gather_df["query_containment_ani"]] + ) assert fmg_query_containment_ani == {0.8442, 0.8613, 0.8632} # gather cANI are nans here -- perhaps b/c sketches too small # assert fmg_query_containment_ani == g_query_containment_ani == set([0.8632, 0.8444, 0.8391]) print("fmg qcANI: ", fmg_query_containment_ani) print("g_qcANI: ", g_query_containment_ani) - fmg_n_unique_weighted_found= set(gather_df['n_unique_weighted_found']) - g_n_unique_weighted_found = set(sourmash_gather_df['n_unique_weighted_found']) - assert fmg_n_unique_weighted_found == g_n_unique_weighted_found == set([457, 148, 463]) + fmg_n_unique_weighted_found = set(gather_df["n_unique_weighted_found"]) + g_n_unique_weighted_found = set(sourmash_gather_df["n_unique_weighted_found"]) + assert ( + fmg_n_unique_weighted_found == g_n_unique_weighted_found == set([457, 148, 463]) + ) - fmg_sum_weighted_found= set(gather_df['sum_weighted_found']) - g_sum_weighted_found = set(sourmash_gather_df['sum_weighted_found']) + fmg_sum_weighted_found = set(gather_df["sum_weighted_found"]) + g_sum_weighted_found = set(sourmash_gather_df["sum_weighted_found"]) assert fmg_sum_weighted_found == g_sum_weighted_found == set([920, 457, 1068]) - fmg_total_weighted_hashes= set(gather_df['total_weighted_hashes']) - g_total_weighted_hashes = set(sourmash_gather_df['total_weighted_hashes']) + fmg_total_weighted_hashes = set(gather_df["total_weighted_hashes"]) + g_total_weighted_hashes = set(sourmash_gather_df["total_weighted_hashes"]) assert fmg_total_weighted_hashes == g_total_weighted_hashes == set([73489]) def test_rocksdb_gather_against_index_with_sigs(runtmp, capfd): # fastmultigather should succeed if indexed sigs are stored internally. - query = get_test_data('SRR606249.sig.gz') + query = get_test_data("SRR606249.sig.gz") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - shutil.copyfile(sig2, runtmp.output('2.fa.sig.gz')) - shutil.copyfile(sig47, runtmp.output('47.fa.sig.gz')) - shutil.copyfile(sig63, runtmp.output('63.fa.sig.gz')) + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + shutil.copyfile(sig2, runtmp.output("2.fa.sig.gz")) + shutil.copyfile(sig47, runtmp.output("47.fa.sig.gz")) + shutil.copyfile(sig63, runtmp.output("63.fa.sig.gz")) - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - against_list = runtmp.output('against.txt') - make_file_list(against_list, ["2.fa.sig.gz", - "47.fa.sig.gz", - "63.fa.sig.gz"]) + against_list = runtmp.output("against.txt") + make_file_list(against_list, ["2.fa.sig.gz", "47.fa.sig.gz", "63.fa.sig.gz"]) # index! note: '--internal-storage' defaults to True - runtmp.sourmash('scripts', 'index', against_list, - '-o', 'subdir/against.rocksdb') + runtmp.sourmash("scripts", "index", against_list, "-o", "subdir/against.rocksdb") # remove the external storage out from under the rocksdb. - os.unlink(runtmp.output('2.fa.sig.gz')) - os.unlink(runtmp.output('47.fa.sig.gz')) - os.unlink(runtmp.output('63.fa.sig.gz')) - - g_output = runtmp.output('zzz.csv') - - runtmp.sourmash('scripts', 'fastmultigather', query_list, - 'subdir/against.rocksdb', '-s', '100000', '-t', '0', - '-o', g_output, - in_location=runtmp.output('')) + os.unlink(runtmp.output("2.fa.sig.gz")) + os.unlink(runtmp.output("47.fa.sig.gz")) + os.unlink(runtmp.output("63.fa.sig.gz")) + + g_output = runtmp.output("zzz.csv") + + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + "subdir/against.rocksdb", + "-s", + "100000", + "-t", + "0", + "-o", + g_output, + in_location=runtmp.output(""), + ) def test_rocksdb_no_internal_storage_gather_fails(runtmp, capfd): # force gather to fail b/c we make an index with no internal sketches - query = get_test_data('SRR606249.sig.gz') + query = get_test_data("SRR606249.sig.gz") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - shutil.copyfile(sig2, runtmp.output('2.fa.sig.gz')) - shutil.copyfile(sig47, runtmp.output('47.fa.sig.gz')) - shutil.copyfile(sig63, runtmp.output('63.fa.sig.gz')) + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + shutil.copyfile(sig2, runtmp.output("2.fa.sig.gz")) + shutil.copyfile(sig47, runtmp.output("47.fa.sig.gz")) + shutil.copyfile(sig63, runtmp.output("63.fa.sig.gz")) - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) - against_list = runtmp.output('against.txt') - make_file_list(against_list, ["2.fa.sig.gz", - "47.fa.sig.gz", - "63.fa.sig.gz"]) - - runtmp.sourmash('scripts', 'index', against_list, '--no-internal-storage', - '-o', 'subdir/against.rocksdb') + against_list = runtmp.output("against.txt") + make_file_list(against_list, ["2.fa.sig.gz", "47.fa.sig.gz", "63.fa.sig.gz"]) + + runtmp.sourmash( + "scripts", + "index", + against_list, + "--no-internal-storage", + "-o", + "subdir/against.rocksdb", + ) # remove the external storage out from under the rocksdb. # this will make gather fail. - os.unlink(runtmp.output('2.fa.sig.gz')) - os.unlink(runtmp.output('47.fa.sig.gz')) - os.unlink(runtmp.output('63.fa.sig.gz')) + os.unlink(runtmp.output("2.fa.sig.gz")) + os.unlink(runtmp.output("47.fa.sig.gz")) + os.unlink(runtmp.output("63.fa.sig.gz")) - g_output = runtmp.output('zzz.csv') + g_output = runtmp.output("zzz.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query_list, - 'subdir/against.rocksdb', '-s', '100000', '-t', '0', - '-o', g_output, - in_location=runtmp.output('')) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + "subdir/against.rocksdb", + "-s", + "100000", + "-t", + "0", + "-o", + g_output, + in_location=runtmp.output(""), + ) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -1199,51 +1872,76 @@ def test_rocksdb_no_internal_storage_gather_fails(runtmp, capfd): assert "Error gathering matches:" in captured.err assert "ERROR: 1 failed gathers. See error messages above." in captured.err - assert "Unresolvable errors found; results cannot be trusted. Quitting." in captured.err - + assert ( + "Unresolvable errors found; results cannot be trusted. Quitting." + in captured.err + ) def test_save_matches(runtmp): # test basic execution! - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) - - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0', '--save-matches', - in_directory=runtmp.output('')) - - print(os.listdir(runtmp.output(''))) - - g_output = runtmp.output('SRR606249.gather.csv') - p_output = runtmp.output('SRR606249.prefetch.csv') - m_output = runtmp.output('SRR606249.matches.sig') + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + "--save-matches", + in_directory=runtmp.output(""), + ) + + print(os.listdir(runtmp.output(""))) + + g_output = runtmp.output("SRR606249.gather.csv") + p_output = runtmp.output("SRR606249.prefetch.csv") + m_output = runtmp.output("SRR606249.matches.sig") assert os.path.exists(g_output) assert os.path.exists(p_output) assert os.path.exists(m_output) - + # check prefetch output (only non-indexed gather) df = pandas.read_csv(p_output) - + assert len(df) == 3 keys = set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + assert keys == { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + } assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys) + assert { + "query_filename", + "query_name", + "query_md5", + "match_name", + "match_md5", + "intersect_bp", + "gather_result_rank", + }.issubset(keys) # can't test against prefetch because matched k-mers can overlap match_ss = list(sourmash.load_file_as_signatures(m_output, ksize=31))[0] @@ -1251,7 +1949,7 @@ def test_save_matches(runtmp): matches_sig_len = len(match_mh) # right size? - assert sum(df['intersect_bp']) >= matches_sig_len * 100_000 + assert sum(df["intersect_bp"]) >= matches_sig_len * 100_000 # containment? mg_ss = list(sourmash.load_file_as_signatures(query, ksize=31))[0] @@ -1261,90 +1959,128 @@ def test_save_matches(runtmp): def test_create_empty_results(runtmp): # sig2 has 0 hashes in common with 47 and 63 - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [sig2]) make_file_list(against_list, [sig47, sig63]) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0', '--create-empty-results', in_directory=runtmp.output('')) - - print(os.listdir(runtmp.output(''))) - - g_output = runtmp.output('CP001071.1.gather.csv') - p_output = runtmp.output('CP001071.1.prefetch.csv') + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-s", + "100000", + "-t", + "0", + "--create-empty-results", + in_directory=runtmp.output(""), + ) + + print(os.listdir(runtmp.output(""))) + + g_output = runtmp.output("CP001071.1.gather.csv") + p_output = runtmp.output("CP001071.1.prefetch.csv") assert os.path.exists(p_output) def test_simple_against_scaled(runtmp, zip_against): # we shouldn't automatically downsample query - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - - downsampled_sigs = runtmp.output('ds.sig.zip') - runtmp.sourmash('sig', 'downsample', - '--scaled', '120_000', - sig2, sig47, sig63, '-o', downsampled_sigs) - - query_list = runtmp.output('query.txt') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + + downsampled_sigs = runtmp.output("ds.sig.zip") + runtmp.sourmash( + "sig", + "downsample", + "--scaled", + "120_000", + sig2, + sig47, + sig63, + "-o", + downsampled_sigs, + ) + + query_list = runtmp.output("query.txt") make_file_list(query_list, [query]) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query_list, - downsampled_sigs, - '-t', '0', in_directory=runtmp.output('')) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + downsampled_sigs, + "-t", + "0", + in_directory=runtmp.output(""), + ) def test_simple_query_scaled(runtmp): # test basic execution w/automatic scaled selection based on query - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-t', '0', in_directory=runtmp.output('')) + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-t", + "0", + in_directory=runtmp.output(""), + ) - print(os.listdir(runtmp.output(''))) + print(os.listdir(runtmp.output(""))) - g_output = runtmp.output('SRR606249.gather.csv') + g_output = runtmp.output("SRR606249.gather.csv") assert os.path.exists(g_output) def test_simple_query_scaled_indexed(runtmp): # test basic execution w/automatic scaled selection based on query # (on a rocksdb) - query = get_test_data('SRR606249.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + query = get_test_data("SRR606249.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) - against_list = index_siglist(runtmp, against_list, - runtmp.output('against.rocksdb')) - - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-o', 'foo.csv', - '-t', '0', in_directory=runtmp.output('')) - - print(os.listdir(runtmp.output(''))) - - g_output = runtmp.output('foo.csv') + against_list = index_siglist(runtmp, against_list, runtmp.output("against.rocksdb")) + + runtmp.sourmash( + "scripts", + "fastmultigather", + query_list, + against_list, + "-o", + "foo.csv", + "-t", + "0", + in_directory=runtmp.output(""), + ) + + print(os.listdir(runtmp.output(""))) + + g_output = runtmp.output("foo.csv") assert os.path.exists(g_output) diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 105c1cb2..9e8a1d4a 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -5,75 +5,77 @@ import shutil from . import sourmash_tst_utils as utils -from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist) +from .sourmash_tst_utils import get_test_data, make_file_list, zip_siglist def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index') + runtmp.sourmash("scripts", "index") - assert 'usage: index' in runtmp.last_result.err + assert "usage: index" in runtmp.last_result.err def test_index(runtmp, toggle_internal_storage): # test basic index! - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(siglist, [sig2, sig47, sig63]) - output = runtmp.output('db.rocksdb') + output = runtmp.output("db.rocksdb") - runtmp.sourmash('scripts', 'index', siglist, - '-o', output, toggle_internal_storage) + runtmp.sourmash("scripts", "index", siglist, "-o", output, toggle_internal_storage) assert os.path.exists(output) print(runtmp.last_result.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err def test_index_warning_message(runtmp, capfd): # test basic index when it has to load things into memory - see #451. - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") # note: can't use zip w/o breaking index. See sourmash-bio/sourmash#3321. - sig2 = get_test_data('2.sig.zip') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.sig.zip") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(siglist, [sig2, sig47, sig63]) - output = runtmp.output('db.rocksdb') + output = runtmp.output("db.rocksdb") - runtmp.sourmash('scripts', 'index', siglist, '-o', output) + runtmp.sourmash("scripts", "index", siglist, "-o", output) assert os.path.exists(output) print(runtmp.last_result.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) - assert "WARNING: loading all sketches into memory in order to index." in captured.err + assert ( + "WARNING: loading all sketches into memory in order to index." in captured.err + ) def test_index_error_message(runtmp, capfd): # test basic index when it errors out b/c can't load - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") # note: can't use zip w/o breaking index. See sourmash-bio/sourmash#3321. - sig2 = get_test_data('2.sig.zip') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.sig.zip") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(siglist, [sig2, sig47, sig63]) - output = runtmp.output('db.rocksdb') + output = runtmp.output("db.rocksdb") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, '-o', output, - '--no-internal-storage') + runtmp.sourmash( + "scripts", "index", siglist, "-o", output, "--no-internal-storage" + ) captured = capfd.readouterr() print(captured.err) @@ -84,127 +86,160 @@ def test_index_recursive(runtmp, capfd): # test index of pathlist containing standalone manifest containing zip. # a little ridiculous, but should hit the various branches in # MultiCollection::load - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") # our basic list of sketches... - sig2_zip = get_test_data('2.sig.zip') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2_zip = get_test_data("2.sig.zip") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") # generate a standalone mf containing a sip - standalone_mf = runtmp.output('stand-mf.csv') - runtmp.sourmash('sig', 'collect', '-F', 'csv', '-o', standalone_mf, - sig2_zip) + standalone_mf = runtmp.output("stand-mf.csv") + runtmp.sourmash("sig", "collect", "-F", "csv", "-o", standalone_mf, sig2_zip) # now make a file list containing that mf make_file_list(siglist, [standalone_mf, sig47, sig63]) - output = runtmp.output('db.rocksdb') + output = runtmp.output("db.rocksdb") - runtmp.sourmash('scripts', 'index', siglist, '-o', output) + runtmp.sourmash("scripts", "index", siglist, "-o", output) captured = capfd.readouterr() print(captured.err) - assert "WARNING: loading all sketches into memory in order to index." in captured.err - assert 'index is done' in runtmp.last_result.err - assert 'Indexing 3 sketches.' in captured.err + assert ( + "WARNING: loading all sketches into memory in order to index." in captured.err + ) + assert "index is done" in runtmp.last_result.err + assert "Indexing 3 sketches." in captured.err def test_index_protein(runtmp, toggle_internal_storage): - sigs = get_test_data('protein.zip') - output = runtmp.output('db.rocksdb') - - runtmp.sourmash('scripts', 'index', sigs, '-k', '19', '-s', '100', - '--moltype', 'protein', '-o', output, - toggle_internal_storage) + sigs = get_test_data("protein.zip") + output = runtmp.output("db.rocksdb") + + runtmp.sourmash( + "scripts", + "index", + sigs, + "-k", + "19", + "-s", + "100", + "--moltype", + "protein", + "-o", + output, + toggle_internal_storage, + ) assert os.path.exists(output) print(runtmp.last_result.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err def test_index_dayhoff(runtmp, toggle_internal_storage): - sigs = get_test_data('dayhoff.zip') - output = runtmp.output('db.rocksdb') - - runtmp.sourmash('scripts', 'index', sigs, '-k', '19', '-s', '100', - '--moltype', 'dayhoff', '-o', output, - toggle_internal_storage) + sigs = get_test_data("dayhoff.zip") + output = runtmp.output("db.rocksdb") + + runtmp.sourmash( + "scripts", + "index", + sigs, + "-k", + "19", + "-s", + "100", + "--moltype", + "dayhoff", + "-o", + output, + toggle_internal_storage, + ) assert os.path.exists(output) print(runtmp.last_result.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err def test_index_protein(runtmp, toggle_internal_storage): - sigs = get_test_data('hp.zip') - output = runtmp.output('db.rocksdb') - - runtmp.sourmash('scripts', 'index', sigs, '-k', '19', '-s', '100', - '--moltype', 'hp', '-o', output, toggle_internal_storage) + sigs = get_test_data("hp.zip") + output = runtmp.output("db.rocksdb") + + runtmp.sourmash( + "scripts", + "index", + sigs, + "-k", + "19", + "-s", + "100", + "--moltype", + "hp", + "-o", + output, + toggle_internal_storage, + ) assert os.path.exists(output) print(runtmp.last_result.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err def test_index_missing_siglist(runtmp, capfd, toggle_internal_storage): # test missing siglist file - siglist = runtmp.output('db-sigs.txt') - output = runtmp.output('out.db') + siglist = runtmp.output("db-sigs.txt") + output = runtmp.output("out.db") # make_file_list(siglist, []) # don't make siglist file with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, - '-o', output, toggle_internal_storage) + runtmp.sourmash( + "scripts", "index", siglist, "-o", output, toggle_internal_storage + ) captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory: ' in captured.err + assert "Error: No such file or directory: " in captured.err def test_index_sig(runtmp, capfd, toggle_internal_storage): # test index with a .sig.gz file instead of pathlist # (should work now) - sig2 = get_test_data('2.fa.sig.gz') - output = runtmp.output('out.db') + sig2 = get_test_data("2.fa.sig.gz") + output = runtmp.output("out.db") - runtmp.sourmash('scripts', 'index', sig2, - '-o', output, toggle_internal_storage) + runtmp.sourmash("scripts", "index", sig2, "-o", output, toggle_internal_storage) captured = capfd.readouterr() print(captured.err) print(runtmp.last_result.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err def test_index_manifest(runtmp, capfd, toggle_internal_storage): # test index with a manifest file - sig2 = get_test_data('2.fa.sig.gz') - output = runtmp.output('out.db') - sig_mf = runtmp.output('mf.csv') + sig2 = get_test_data("2.fa.sig.gz") + output = runtmp.output("out.db") + sig_mf = runtmp.output("mf.csv") runtmp.sourmash("sig", "manifest", sig2, "-o", sig_mf) - runtmp.sourmash('scripts', 'index', sig_mf, - '-o', output, toggle_internal_storage) + runtmp.sourmash("scripts", "index", sig_mf, "-o", output, toggle_internal_storage) captured = capfd.readouterr() print(captured.err) print(runtmp.last_result.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err def test_index_bad_siglist_2(runtmp, capfd): # test with a bad siglist (containing a missing file) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, "no-exist"]) - db = runtmp.output('db.rocksdb') + db = runtmp.output("db.rocksdb") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', against_list, - '-o', db) + runtmp.sourmash("scripts", "index", against_list, "-o", db) captured = capfd.readouterr() print(captured.err) @@ -213,16 +248,15 @@ def test_index_bad_siglist_2(runtmp, capfd): def test_index_empty_siglist(runtmp, capfd): # test empty siglist file - siglist = runtmp.output('db-sigs.txt') - output = runtmp.output('out.db') - make_file_list(siglist, []) # empty + siglist = runtmp.output("db-sigs.txt") + output = runtmp.output("out.db") + make_file_list(siglist, []) # empty with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, - '-o', output) + runtmp.sourmash("scripts", "index", siglist, "-o", output) captured = capfd.readouterr() - assert not os.path.exists(output) # do we want an empty file, or no file? + assert not os.path.exists(output) # do we want an empty file, or no file? print(runtmp.last_result.out) print(runtmp.last_result.err) print(captured.err) @@ -231,15 +265,14 @@ def test_index_empty_siglist(runtmp, capfd): def test_index_nomatch(runtmp, capfd): # test index with a siglist file that has (only) a non-matching ksize sig - siglist = runtmp.output('against.txt') - db = runtmp.output('db.rocksdb') + siglist = runtmp.output("against.txt") + db = runtmp.output("db.rocksdb") - sig1 = get_test_data('1.fa.k21.sig.gz') + sig1 = get_test_data("1.fa.k21.sig.gz") make_file_list(siglist, [sig1]) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, - '-o', db) + runtmp.sourmash("scripts", "index", siglist, "-o", db) captured = capfd.readouterr() print(runtmp.last_result.out) @@ -251,15 +284,14 @@ def test_index_nomatch(runtmp, capfd): def test_index_nomatch_sig_in_siglist(runtmp, capfd): # test index with a siglist file that has both matching and non-matching sigs - siglist = runtmp.output('against.txt') - db = runtmp.output('db.rocksdb') + siglist = runtmp.output("against.txt") + db = runtmp.output("db.rocksdb") - sig2 = get_test_data('2.fa.sig.gz') - sig1 = get_test_data('1.fa.k21.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig1 = get_test_data("1.fa.k21.sig.gz") make_file_list(siglist, [sig2, sig1]) - runtmp.sourmash('scripts', 'index', siglist, - '-o', db) + runtmp.sourmash("scripts", "index", siglist, "-o", db) captured = capfd.readouterr() print(runtmp.last_result.out) @@ -271,26 +303,25 @@ def test_index_nomatch_sig_in_siglist(runtmp, capfd): def test_index_zipfile(runtmp, capfd, toggle_internal_storage): # test basic index from sourmash zipfile - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(siglist, [sig2, sig47, sig63]) - zipf = runtmp.output('sigs.zip') + zipf = runtmp.output("sigs.zip") - runtmp.sourmash('sig', 'cat', siglist, '-o', zipf) + runtmp.sourmash("sig", "cat", siglist, "-o", zipf) - output = runtmp.output('db.rocksdb') + output = runtmp.output("db.rocksdb") - runtmp.sourmash('scripts', 'index', zipf, - '-o', output, toggle_internal_storage) + runtmp.sourmash("scripts", "index", zipf, "-o", output, toggle_internal_storage) assert os.path.exists(output) print(runtmp.last_result.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) @@ -303,113 +334,119 @@ def test_index_zipfile_subdir(runtmp, capfd, toggle_internal_storage): # * use non-abspath to zip file for indexing # so that the relative path gets things wrong. - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - shutil.copyfile(sig2, runtmp.output('2.fa.sig.gz')) - shutil.copyfile(sig47, runtmp.output('47.fa.sig.gz')) - shutil.copyfile(sig63, runtmp.output('63.fa.sig.gz')) + shutil.copyfile(sig2, runtmp.output("2.fa.sig.gz")) + shutil.copyfile(sig47, runtmp.output("47.fa.sig.gz")) + shutil.copyfile(sig63, runtmp.output("63.fa.sig.gz")) - os.mkdir(runtmp.output('subdir')) + os.mkdir(runtmp.output("subdir")) - zipf = 'sigs.zip' + zipf = "sigs.zip" - runtmp.sourmash('sig', 'cat', '2.fa.sig.gz', '47.fa.sig.gz', - '63.fa.sig.gz', '-o', zipf) + runtmp.sourmash( + "sig", "cat", "2.fa.sig.gz", "47.fa.sig.gz", "63.fa.sig.gz", "-o", zipf + ) - output = runtmp.output('subdir/db.rocksdb') + output = runtmp.output("subdir/db.rocksdb") - runtmp.sourmash('scripts', 'index', zipf, - '-o', output, in_directory=runtmp.output(''), - toggle_internal_storage=toggle_internal_storage) + runtmp.sourmash( + "scripts", + "index", + zipf, + "-o", + output, + in_directory=runtmp.output(""), + toggle_internal_storage=toggle_internal_storage, + ) assert os.path.exists(output) print(runtmp.last_result.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) - runtmp.sourmash('scripts', 'check', 'db.rocksdb', - in_directory=runtmp.output('subdir')) - runtmp.sourmash('scripts', 'check', 'subdir/db.rocksdb', - in_directory=runtmp.output('')) + runtmp.sourmash( + "scripts", "check", "db.rocksdb", in_directory=runtmp.output("subdir") + ) + runtmp.sourmash( + "scripts", "check", "subdir/db.rocksdb", in_directory=runtmp.output("") + ) def test_index_zipfile_repeated_md5sums(runtmp, capfd, toggle_internal_storage): # test that we're reading all files, including repeated md5sums - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig2a = runtmp.output('sig2a.sig.gz') - sig2b = runtmp.output('sig2b.sig.gz') - runtmp.sourmash('sig', 'rename', sig2, 'name2', '-o', sig2a) - runtmp.sourmash('sig', 'rename', sig2, 'name3', '-o', sig2b) + sig2 = get_test_data("2.fa.sig.gz") + sig2a = runtmp.output("sig2a.sig.gz") + sig2b = runtmp.output("sig2b.sig.gz") + runtmp.sourmash("sig", "rename", sig2, "name2", "-o", sig2a) + runtmp.sourmash("sig", "rename", sig2, "name3", "-o", sig2b) make_file_list(siglist, [sig2, sig2a, sig2b]) - zipf = runtmp.output('sigs.zip') - runtmp.sourmash('sig', 'cat', siglist, '-o', zipf) + zipf = runtmp.output("sigs.zip") + runtmp.sourmash("sig", "cat", siglist, "-o", zipf) - output = runtmp.output('db.rocksdb') + output = runtmp.output("db.rocksdb") - runtmp.sourmash('scripts', 'index', zipf, - '-o', output, toggle_internal_storage) + runtmp.sourmash("scripts", "index", zipf, "-o", output, toggle_internal_storage) assert os.path.exists(output) print(runtmp.last_result.err) captured = capfd.readouterr() print(captured.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err def test_index_zipfile_multiparam(runtmp, capfd, toggle_internal_storage): # test index from sourmash zipfile with multiple ksizes / scaled /moltype - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - sig1 = get_test_data('1.combined.sig.gz') - srr = get_test_data('SRR606249.sig.gz') - prot = get_test_data('protein.zip') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + sig1 = get_test_data("1.combined.sig.gz") + srr = get_test_data("SRR606249.sig.gz") + prot = get_test_data("protein.zip") make_file_list(siglist, [sig2, sig47, sig63, sig1, srr, prot]) - zipf = runtmp.output('sigs.zip') + zipf = runtmp.output("sigs.zip") - runtmp.sourmash('sig', 'cat', siglist, '-o', zipf) + runtmp.sourmash("sig", "cat", siglist, "-o", zipf) - output = runtmp.output('db.rocksdb') + output = runtmp.output("db.rocksdb") - runtmp.sourmash('scripts', 'index', zipf, - '-o', output, toggle_internal_storage) + runtmp.sourmash("scripts", "index", zipf, "-o", output, toggle_internal_storage) assert os.path.exists(output) print(runtmp.last_result.err) - assert 'index is done' in runtmp.last_result.err + assert "index is done" in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) def test_index_zipfile_bad(runtmp, capfd): # test with a bad input zipfile (a .sig.gz file renamed as zip file) - sig2 = get_test_data('2.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") - query_zip = runtmp.output('query.zip') + query_zip = runtmp.output("query.zip") # cp sig2 into query_zip - with open(query_zip, 'wb') as fp: - with open(sig2, 'rb') as fp2: + with open(query_zip, "wb") as fp: + with open(sig2, "rb") as fp2: fp.write(fp2.read()) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', query_zip, - '-o', output) + runtmp.sourmash("scripts", "index", query_zip, "-o", output) captured = capfd.readouterr() print(captured.err) @@ -419,60 +456,57 @@ def test_index_zipfile_bad(runtmp, capfd): def test_index_check(runtmp, toggle_internal_storage): # test check index - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") make_file_list(siglist, [sig2, sig47]) - output = runtmp.output('db.rocksdb') + output = runtmp.output("db.rocksdb") - runtmp.sourmash('scripts', 'index', siglist, - '-o', output, toggle_internal_storage) + runtmp.sourmash("scripts", "index", siglist, "-o", output, toggle_internal_storage) - runtmp.sourmash('scripts', 'check', output) + runtmp.sourmash("scripts", "check", output) print(runtmp.last_result.err) - assert 'index is ok' in runtmp.last_result.err + assert "index is ok" in runtmp.last_result.err def test_index_check_quick(runtmp, toggle_internal_storage): # test check index - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") make_file_list(siglist, [sig2, sig47]) - output = runtmp.output('db.rocksdb') + output = runtmp.output("db.rocksdb") - runtmp.sourmash('scripts', 'index', siglist, - '-o', output, toggle_internal_storage) + runtmp.sourmash("scripts", "index", siglist, "-o", output, toggle_internal_storage) - runtmp.sourmash('scripts', 'check', '--quick', output) + runtmp.sourmash("scripts", "check", "--quick", output) print(runtmp.last_result.err) - assert 'index is ok' in runtmp.last_result.err + assert "index is ok" in runtmp.last_result.err def test_index_subdir(runtmp, toggle_internal_storage): # test basic index & output to subdir - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(siglist, [sig2, sig47, sig63]) - os.mkdir(runtmp.output('subdir')) - output = runtmp.output('subdir/db.rocksdb') + os.mkdir(runtmp.output("subdir")) + output = runtmp.output("subdir/db.rocksdb") - runtmp.sourmash('scripts', 'index', siglist, - '-o', output, toggle_internal_storage) + runtmp.sourmash("scripts", "index", siglist, "-o", output, toggle_internal_storage) assert os.path.exists(output) print(runtmp.last_result.err) - runtmp.sourmash('scripts', 'check', output) + runtmp.sourmash("scripts", "check", output) diff --git a/src/python/tests/test_manysearch.py b/src/python/tests/test_manysearch.py index 4750d9d6..f56708c7 100644 --- a/src/python/tests/test_manysearch.py +++ b/src/python/tests/test_manysearch.py @@ -4,70 +4,75 @@ import sourmash from . import sourmash_tst_utils as utils -from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist, - index_siglist) +from .sourmash_tst_utils import ( + get_test_data, + make_file_list, + zip_siglist, + index_siglist, +) def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch') + runtmp.sourmash("scripts", "manysearch") - assert 'usage: manysearch' in runtmp.last_result.err + assert "usage: manysearch" in runtmp.last_result.err def test_simple(runtmp, zip_query, zip_against): # test basic execution! - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output, '-t', '0.01') + runtmp.sourmash( + "scripts", "manysearch", query_list, against_list, "-o", output, "-t", "0.01" + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 5 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) - assert float(row['match_containment_ani'] == 1.0) - assert float(row['average_containment_ani'] == 1.0) - assert float(row['max_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) + assert float(row["match_containment_ani"] == 1.0) + assert float(row["average_containment_ani"] == 1.0) + assert float(row["max_containment_ani"] == 1.0) else: # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - query_ani = float(row['query_containment_ani']) - match_ani = float(row['match_containment_ani']) - average_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + query_ani = float(row["query_containment_ani"]) + match_ani = float(row["match_containment_ani"]) + average_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) maxcont = round(maxcont, 4) @@ -75,10 +80,19 @@ def test_simple(runtmp, zip_query, zip_against): match_ani = round(match_ani, 4) average_ani = round(average_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", - f"{query_ani:.04}", f"{match_ani:.04}", f"{average_ani:.04}", f"{max_ani:.04}") - - if q == 'NC_011665.1' and m == 'NC_009661.1': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + f"{query_ani:.04}", + f"{match_ani:.04}", + f"{average_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "NC_011665.1" and m == "NC_009661.1": assert jaccard == 0.3207 assert cont == 0.4828 assert maxcont == 0.4885 @@ -88,7 +102,7 @@ def test_simple(runtmp, zip_query, zip_against): assert average_ani == 0.977 assert max_ani == 0.9772 - if q == 'NC_009661.1' and m == 'NC_011665.1': + if q == "NC_009661.1" and m == "NC_011665.1": assert jaccard == 0.3207 assert cont == 0.4885 assert maxcont == 0.4885 @@ -101,38 +115,49 @@ def test_simple(runtmp, zip_query, zip_against): def test_simple_abund(runtmp): # test with abund sig - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - query_list = runtmp.output('query.txt') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + query_list = runtmp.output("query.txt") make_file_list(query_list, [sig2, sig47, sig63]) - against = get_test_data('SRR606249.sig.gz') - - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'manysearch', query_list, against, - '-o', output, '--scaled', '100000', '-k', '31', - '-t', '0.01') + against = get_test_data("SRR606249.sig.gz") + + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "manysearch", + query_list, + against, + "-o", + output, + "--scaled", + "100000", + "-k", + "31", + "-t", + "0.01", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 3 - dd = df.to_dict(orient='index') - dd = list(sorted(dd.values(), key=lambda x: x['query_name'])) + dd = df.to_dict(orient="index") + dd = list(sorted(dd.values(), key=lambda x: x["query_name"])) print(dd) row = dd[0] - query_name = row['query_name'].split()[0] - average_abund = round(float(row['average_abund']), 4) - median_abund = round(float(row['median_abund']), 4) - std_abund = round(float(row['std_abund']), 4) - n_weighted_found = int(row['n_weighted_found']) - total_weighted_hashes = int(row['total_weighted_hashes']) - - assert query_name == 'CP001071.1' + query_name = row["query_name"].split()[0] + average_abund = round(float(row["average_abund"]), 4) + median_abund = round(float(row["median_abund"]), 4) + std_abund = round(float(row["std_abund"]), 4) + n_weighted_found = int(row["n_weighted_found"]) + total_weighted_hashes = int(row["total_weighted_hashes"]) + + assert query_name == "CP001071.1" assert average_abund == round(21.045454545454500, 4) assert median_abund == 21.5 assert std_abund == round(5.644605411181010, 4) @@ -140,14 +165,14 @@ def test_simple_abund(runtmp): assert total_weighted_hashes == 73489 row = dd[1] - query_name = row['query_name'].split()[0] - average_abund = round(float(row['average_abund']), 4) - median_abund = round(float(row['median_abund']), 4) - std_abund = round(float(row['std_abund']), 4) - n_weighted_found = int(row['n_weighted_found']) - total_weighted_hashes = int(row['total_weighted_hashes']) - - assert query_name == 'NC_009661.1' + query_name = row["query_name"].split()[0] + average_abund = round(float(row["average_abund"]), 4) + median_abund = round(float(row["median_abund"]), 4) + std_abund = round(float(row["std_abund"]), 4) + n_weighted_found = int(row["n_weighted_found"]) + total_weighted_hashes = int(row["total_weighted_hashes"]) + + assert query_name == "NC_009661.1" assert average_abund == round(11.365853658536600, 4) assert median_abund == 11.0 assert std_abund == round(4.976805212676670, 4) @@ -155,14 +180,14 @@ def test_simple_abund(runtmp): assert total_weighted_hashes == 73489 row = dd[2] - query_name = row['query_name'].split()[0] - average_abund = round(float(row['average_abund']), 4) - median_abund = round(float(row['median_abund']), 4) - std_abund = round(float(row['std_abund']), 4) - n_weighted_found = int(row['n_weighted_found']) - total_weighted_hashes = int(row['total_weighted_hashes']) - - assert query_name == 'NC_011665.1' + query_name = row["query_name"].split()[0] + average_abund = round(float(row["average_abund"]), 4) + median_abund = round(float(row["median_abund"]), 4) + std_abund = round(float(row["std_abund"]), 4) + n_weighted_found = int(row["n_weighted_found"]) + total_weighted_hashes = int(row["total_weighted_hashes"]) + + assert query_name == "NC_011665.1" assert average_abund == round(10.386363636363600, 4) assert median_abund == 10.5 assert std_abund == round(6.932190750047300, 4) @@ -172,59 +197,60 @@ def test_simple_abund(runtmp): def test_simple_indexed(runtmp, zip_query, indexed_query): # test basic execution! - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) if indexed_query: - query_list = index_siglist(runtmp, query_list, runtmp.output('query_db')) + query_list = index_siglist(runtmp, query_list, runtmp.output("query_db")) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) - print('query_list is:', query_list) - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output, '-t', '0.01') + print("query_list is:", query_list) + runtmp.sourmash( + "scripts", "manysearch", query_list, against_list, "-o", output, "-t", "0.01" + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 5 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # identical? - if row['match_name'] == row['query_name']: - assert float(row['containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert float(row["containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) else: # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - intersect_hashes = int(row['intersect_hashes']) - query_ani = float(row['query_containment_ani']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + intersect_hashes = int(row["intersect_hashes"]) + query_ani = float(row["query_containment_ani"]) cont = round(cont, 4) query_ani = round(query_ani, 4) print(q, m, f"{cont:.04}", f"{query_ani:.04}") - if q == 'NC_011665.1' and m == 'NC_009661.1': + if q == "NC_011665.1" and m == "NC_009661.1": assert cont == 0.4828 assert intersect_hashes == 2529 assert query_ani == 0.9768 - if q == 'NC_009661.1' and m == 'NC_011665.1': + if q == "NC_009661.1" and m == "NC_011665.1": assert cont == 0.4885 assert intersect_hashes == 2529 assert query_ani == 0.9772 @@ -232,50 +258,51 @@ def test_simple_indexed(runtmp, zip_query, indexed_query): def test_simple_list_of_zips(runtmp): # test basic execution! - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.sig.zip') - sig47 = get_test_data('47.sig.zip') - sig63 = get_test_data('63.sig.zip') + sig2 = get_test_data("2.sig.zip") + sig47 = get_test_data("47.sig.zip") + sig63 = get_test_data("63.sig.zip") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output, '-t', '0.01') + runtmp.sourmash( + "scripts", "manysearch", query_list, against_list, "-o", output, "-t", "0.01" + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 5 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # identical? - if row['match_name'] == row['query_name']: - assert float(row['containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert float(row["containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) else: # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - intersect_hashes = int(row['intersect_hashes']) - query_ani = float(row['query_containment_ani']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + intersect_hashes = int(row["intersect_hashes"]) + query_ani = float(row["query_containment_ani"]) cont = round(cont, 4) query_ani = round(query_ani, 4) print(q, m, f"{cont:.04}", f"{query_ani:.04}") - if q == 'NC_011665.1' and m == 'NC_009661.1': + if q == "NC_011665.1" and m == "NC_009661.1": assert cont == 0.4828 assert intersect_hashes == 2529 assert query_ani == 0.9768 - if q == 'NC_009661.1' and m == 'NC_011665.1': + if q == "NC_009661.1" and m == "NC_011665.1": assert cont == 0.4885 assert intersect_hashes == 2529 assert query_ani == 0.9772 @@ -283,26 +310,36 @@ def test_simple_list_of_zips(runtmp): def test_simple_with_cores(runtmp, capfd, indexed, zip_query): # test basic execution with -c argument (that it runs, at least!) - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) - - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output, '-c', '4', '-t', '0.01') + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) + + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "manysearch", + query_list, + against_list, + "-o", + output, + "-c", + "4", + "-t", + "0.01", + ) assert os.path.exists(output) df = pandas.read_csv(output) @@ -315,26 +352,27 @@ def test_simple_with_cores(runtmp, capfd, indexed, zip_query): def test_simple_threshold(runtmp, indexed, zip_query): # test with a simple threshold => only 3 results - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output, '-t', '0.5') + runtmp.sourmash( + "scripts", "manysearch", query_list, against_list, "-o", output, "-t", "0.5" + ) assert os.path.exists(output) df = pandas.read_csv(output) @@ -343,12 +381,12 @@ def test_simple_threshold(runtmp, indexed, zip_query): def test_simple_manifest(runtmp, indexed): # test with a simple threshold => only 3 results - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) @@ -360,14 +398,15 @@ def test_simple_manifest(runtmp, indexed): runtmp.sourmash("sig", "manifest", against_list, "-o", against_mf) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) else: against_list = against_mf - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'manysearch', query_mf, against_list, - '-o', output, '-t', '0.5') + runtmp.sourmash( + "scripts", "manysearch", query_mf, against_list, "-o", output, "-t", "0.5" + ) assert os.path.exists(output) df = pandas.read_csv(output) @@ -376,148 +415,145 @@ def test_simple_manifest(runtmp, indexed): def test_missing_query(runtmp, capfd, indexed, zip_query): # test with a missing query list - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - #make_file_list(query_list, [sig2, sig47, sig63]) # don't make query + # make_file_list(query_list, [sig2, sig47, sig63]) # don't make query make_file_list(against_list, [sig2, sig47, sig63]) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) if zip_query: - query_list = runtmp.output('query.zip') + query_list = runtmp.output("query.zip") - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "manysearch", query_list, against_list, "-o", output) captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory' in captured.err + assert "Error: No such file or directory" in captured.err def test_sig_query(runtmp, capfd, indexed): # test with a single sig query (a .sig.gz file) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'manysearch', sig2, against_list, - '-o', output) + runtmp.sourmash("scripts", "manysearch", sig2, against_list, "-o", output) def test_bad_query_2(runtmp, capfd, indexed): # test with a bad query list (a missing file) - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, "no-exist"]) make_file_list(against_list, [sig2, sig47, sig63]) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) - output = runtmp.output('out.csv') + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "manysearch", query_list, against_list, "-o", output) captured = capfd.readouterr() print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err + assert ( + "WARNING: 1 query paths failed to load. See error messages above." + in captured.err + ) def test_bad_query_3(runtmp, capfd): # test with a bad query (a .sig.gz file renamed as zip file) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_zip = runtmp.output('query.zip') + query_zip = runtmp.output("query.zip") # cp sig2 into query_zip - with open(query_zip, 'wb') as fp: - with open(sig2, 'rb') as fp2: + with open(query_zip, "wb") as fp: + with open(sig2, "rb") as fp2: fp.write(fp2.read()) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_zip, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_zip, against_list, "-o", output) captured = capfd.readouterr() print(captured.err) - assert 'InvalidArchive' in captured.err + assert "InvalidArchive" in captured.err def test_missing_against(runtmp, capfd, indexed): # test with a missing against list - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) # do not create against_list - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "manysearch", query_list, against_list, "-o", output) captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory' in captured.err + assert "Error: No such file or directory" in captured.err def test_nomatch_against(runtmp, capfd): # nonmatching against file (num sig) - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - nomatch_sketch = get_test_data('SRR606249.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + nomatch_sketch = get_test_data("SRR606249.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [nomatch_sketch]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "manysearch", query_list, against_list, "-o", output) captured = capfd.readouterr() assert "No search signatures loaded, exiting." in captured.err @@ -525,47 +561,48 @@ def test_nomatch_against(runtmp, capfd): def test_bad_against(runtmp, capfd): # test with a bad against list (a missing file) - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, "no-exist"]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "manysearch", query_list, against_list, "-o", output) captured = capfd.readouterr() print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err + assert ( + "WARNING: 1 search paths failed to load. See error messages above." + in captured.err + ) def test_empty_query(runtmp, indexed, capfd): # test with an empty query list - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, []) make_file_list(against_list, [sig2, sig47, sig63]) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "manysearch", query_list, against_list, "-o", output) print(runtmp.last_result.err) captured = capfd.readouterr() @@ -575,91 +612,88 @@ def test_empty_query(runtmp, indexed, capfd): def test_nomatch_query(runtmp, capfd, indexed, zip_query): # test a non-matching (diff ksize) in query; do we get warning message? - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig1 = get_test_data('1.fa.k21.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig1 = get_test_data("1.fa.k21.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63, sig1]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "manysearch", query_list, against_list, "-o", output) assert os.path.exists(output) captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 query paths - no compatible signatures.' in captured.err + assert "WARNING: skipped 1 query paths - no compatible signatures." in captured.err def test_load_only_one_bug(runtmp, capfd, indexed, zip_against): # check that we behave properly when presented with multiple against # sketches - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig1_k31 = get_test_data('1.fa.k31.sig.gz') + sig1_k31 = get_test_data("1.fa.k31.sig.gz") # note: this was created as a 3-sketch-in-one-signature directly # via sourmash sketch dna -p k=21,k=31,k=51. - sig1_all = get_test_data('1.combined.sig.gz') + sig1_all = get_test_data("1.combined.sig.gz") make_file_list(query_list, [sig1_k31]) make_file_list(against_list, [sig1_all]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_against: - against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) elif indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "manysearch", query_list, against_list, "-o", output) assert os.path.exists(output) captured = capfd.readouterr() print(captured.err) - assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err - assert not 'WARNING: no compatible sketches in path ' in captured.err + assert not "WARNING: skipped 1 paths - no compatible signatures." in captured.err + assert not "WARNING: no compatible sketches in path " in captured.err def test_load_only_one_bug_as_query(runtmp, capfd, indexed, zip_query): # check that we behave properly when presented with multiple query # sketches in one file, with only one matching. - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig1_k31 = get_test_data('1.fa.k31.sig.gz') + sig1_k31 = get_test_data("1.fa.k31.sig.gz") # note: this was created as a 3-sketch-in-one-signature directly # via sourmash sketch dna -p k=21,k=31,k=51. - sig1_all = get_test_data('1.combined.sig.gz') + sig1_all = get_test_data("1.combined.sig.gz") make_file_list(query_list, [sig1_all]) make_file_list(against_list, [sig1_k31]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "manysearch", query_list, against_list, "-o", output) assert os.path.exists(output) @@ -667,46 +701,47 @@ def test_load_only_one_bug_as_query(runtmp, capfd, indexed, zip_query): print(captured.err) print(runtmp.last_result.out) - assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err - assert not 'WARNING: no compatible sketches in path ' in captured.err + assert not "WARNING: skipped 1 paths - no compatible signatures." in captured.err + assert not "WARNING: no compatible sketches in path " in captured.err def test_md5(runtmp, indexed, zip_query): # test that md5s match what was in the original files, not downsampled etc. - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') - + output = runtmp.output("out.csv") + if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, - '-o', output, '-t', '0.01') + runtmp.sourmash( + "scripts", "manysearch", query_list, against_list, "-o", output, "-t", "0.01" + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 5 - md5s = list(df['query_md5']) + md5s = list(df["query_md5"]) print(md5s) for query_file in (sig2, sig47, sig63): for ss in sourmash.load_file_as_signatures(query_file, ksize=31): assert ss.md5sum() in md5s - if not indexed: # indexed search cannot produce match_md5 - md5s = list(df['match_md5']) + if not indexed: # indexed search cannot produce match_md5 + md5s = list(df["match_md5"]) print(md5s) for against_file in (sig2, sig47, sig63): @@ -716,45 +751,58 @@ def test_md5(runtmp, indexed, zip_query): def test_simple_protein(runtmp): # test basic execution with proteins - protsigs = get_test_data('protein.zip') - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'manysearch', protsigs, protsigs, - '-k', '19', '-s', '100', '--moltype', 'protein', - '-o', output, '-t', '0.01') + protsigs = get_test_data("protein.zip") + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "manysearch", + protsigs, + protsigs, + "-k", + "19", + "-s", + "100", + "--moltype", + "protein", + "-o", + output, + "-t", + "0.01", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 4 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): print(row) # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert float(row['query_containment_ani']) == 1.0 - assert float(row['match_containment_ani']) == 1.0 - assert float(row['average_containment_ani']) == 1.0 - assert float(row['max_containment_ani']) == 1.0 + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"]) == 1.0 + assert float(row["match_containment_ani"]) == 1.0 + assert float(row["average_containment_ani"]) == 1.0 + assert float(row["max_containment_ani"]) == 1.0 else: - # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - query_ani = float(row['query_containment_ani']) - match_ani = float(row['match_containment_ani']) - average_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + # confirm hand-checked numbers + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + query_ani = float(row["query_containment_ani"]) + match_ani = float(row["match_containment_ani"]) + average_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -763,73 +811,98 @@ def test_simple_protein(runtmp): match_ani = round(match_ani, 4) average_ani = round(average_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", intersect_hashes, f"{query_ani:.04}", f"{match_ani:.04}", f"{average_ani:.04}", f"{max_ani:.04}") - - if q == 'GCA_001593925' and m == 'GCA_001593935': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + intersect_hashes, + f"{query_ani:.04}", + f"{match_ani:.04}", + f"{average_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "GCA_001593925" and m == "GCA_001593935": assert jaccard == 0.0434 assert cont == 0.1003 assert maxcont == 0.1003 assert intersect_hashes == 342 - assert query_ani == 0.9605 + assert query_ani == 0.9605 assert match_ani == 0.9547 assert average_ani == 0.9576 assert max_ani == 0.9605 - if q == 'GCA_001593935' and m == 'GCA_001593925': + if q == "GCA_001593935" and m == "GCA_001593925": assert jaccard == 0.0434 assert cont == 0.0712 assert maxcont == 0.1003 assert intersect_hashes == 342 - assert query_ani == 0.9547 - assert match_ani == 0.9605 + assert query_ani == 0.9547 + assert match_ani == 0.9605 assert average_ani == 0.9576 assert max_ani == 0.9605 def test_simple_protein_indexed(runtmp): # test basic execution with proteins - protsigs = get_test_data('protein.zip') - output = runtmp.output('out.csv') - - protsigs_db = index_siglist(runtmp, protsigs, runtmp.output('db'), - ksize=19, moltype='protein', scaled=100) - - runtmp.sourmash('scripts', 'manysearch', protsigs, protsigs_db, - '-k', '19', '-s', '100', '--moltype', 'protein', - '-o', output, '-t', '0.01') + protsigs = get_test_data("protein.zip") + output = runtmp.output("out.csv") + + protsigs_db = index_siglist( + runtmp, protsigs, runtmp.output("db"), ksize=19, moltype="protein", scaled=100 + ) + + runtmp.sourmash( + "scripts", + "manysearch", + protsigs, + protsigs_db, + "-k", + "19", + "-s", + "100", + "--moltype", + "protein", + "-o", + output, + "-t", + "0.01", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 4 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): print(row) # identical? - if row['match_name'] == row['query_name']: - assert float(row['containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert float(row["containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) else: - # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - query_ani = float(row['query_containment_ani']) - intersect_hashes = int(row['intersect_hashes']) + # confirm hand-checked numbers + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + query_ani = float(row["query_containment_ani"]) + intersect_hashes = int(row["intersect_hashes"]) cont = round(cont, 4) query_ani = round(query_ani, 4) print(q, m, f"{cont:.04}", intersect_hashes, f"{query_ani:.04}") - if q == 'GCA_001593925' and m == 'GCA_001593935': + if q == "GCA_001593925" and m == "GCA_001593935": assert cont == 0.1003 assert intersect_hashes == 342 assert query_ani == 0.9605 - if q == 'GCA_001593935' and m == 'GCA_001593925': + if q == "GCA_001593935" and m == "GCA_001593925": assert cont == 0.0712 assert intersect_hashes == 342 assert query_ani == 0.9547 @@ -837,46 +910,59 @@ def test_simple_protein_indexed(runtmp): def test_simple_dayhoff(runtmp): # test basic execution with dayhoff - protsigs = get_test_data('dayhoff.zip') - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'manysearch', protsigs, protsigs, - '-k', '19', '-s', '100', '--moltype', 'dayhoff', - '-o', output, '-t', '0.01') + protsigs = get_test_data("dayhoff.zip") + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "manysearch", + protsigs, + protsigs, + "-k", + "19", + "-s", + "100", + "--moltype", + "dayhoff", + "-o", + output, + "-t", + "0.01", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 4 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): print(row) # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert float(row['query_containment_ani']) == 1.0 - assert float(row['match_containment_ani']) == 1.0 - assert float(row['average_containment_ani']) == 1.0 - assert float(row['max_containment_ani']) == 1.0 - + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"]) == 1.0 + assert float(row["match_containment_ani"]) == 1.0 + assert float(row["average_containment_ani"]) == 1.0 + assert float(row["max_containment_ani"]) == 1.0 + else: - # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - query_ani = float(row['query_containment_ani']) - match_ani = float(row['match_containment_ani']) - average_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + # confirm hand-checked numbers + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + query_ani = float(row["query_containment_ani"]) + match_ani = float(row["match_containment_ani"]) + average_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -885,9 +971,20 @@ def test_simple_dayhoff(runtmp): match_ani = round(match_ani, 4) average_ani = round(average_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", intersect_hashes, f"{query_ani:.04}", f"{match_ani:.04}", f"{average_ani:.04}", f"{max_ani:.04}") - - if q == 'GCA_001593925' and m == 'GCA_001593935': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + intersect_hashes, + f"{query_ani:.04}", + f"{match_ani:.04}", + f"{average_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "GCA_001593925" and m == "GCA_001593935": assert jaccard == 0.1326 assert cont == 0.2815 assert maxcont == 0.2815 @@ -897,7 +994,7 @@ def test_simple_dayhoff(runtmp): assert average_ani == 0.9751 assert max_ani == 0.978 - if q == 'GCA_001593935' and m == 'GCA_001593925': + if q == "GCA_001593935" and m == "GCA_001593925": assert jaccard == 0.1326 assert cont == 0.2004 assert maxcont == 0.2815 @@ -910,48 +1007,62 @@ def test_simple_dayhoff(runtmp): def test_simple_dayhoff_indexed(runtmp): # test indexed execution with dayhoff - protsigs = get_test_data('dayhoff.zip') - output = runtmp.output('out.csv') - - protsigs_db = index_siglist(runtmp, protsigs, runtmp.output('db'), - ksize=19, moltype='dayhoff', scaled=100) - - runtmp.sourmash('scripts', 'manysearch', protsigs, protsigs_db, - '-k', '19', '-s', '100', '--moltype', 'dayhoff', - '-o', output, '-t', '0.01') + protsigs = get_test_data("dayhoff.zip") + output = runtmp.output("out.csv") + + protsigs_db = index_siglist( + runtmp, protsigs, runtmp.output("db"), ksize=19, moltype="dayhoff", scaled=100 + ) + + runtmp.sourmash( + "scripts", + "manysearch", + protsigs, + protsigs_db, + "-k", + "19", + "-s", + "100", + "--moltype", + "dayhoff", + "-o", + output, + "-t", + "0.01", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 4 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): print(row) # identical? - if row['match_name'] == row['query_name']: - assert float(row['containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert float(row["containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) else: - # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - query_ani = float(row['query_containment_ani']) - intersect_hashes = int(row['intersect_hashes']) + # confirm hand-checked numbers + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + query_ani = float(row["query_containment_ani"]) + intersect_hashes = int(row["intersect_hashes"]) cont = round(cont, 4) query_ani = round(query_ani, 4) print(q, m, f"{cont:.04}", intersect_hashes, f"{query_ani:.04}") - if q == 'GCA_001593925' and m == 'GCA_001593935': + if q == "GCA_001593925" and m == "GCA_001593935": assert cont == 0.2815 assert intersect_hashes == 930 assert query_ani == 0.978 - if q == 'GCA_001593935' and m == 'GCA_001593925': + if q == "GCA_001593935" and m == "GCA_001593925": assert cont == 0.2004 assert intersect_hashes == 930 assert query_ani == 0.9722 @@ -959,45 +1070,58 @@ def test_simple_dayhoff_indexed(runtmp): def test_simple_hp(runtmp): # test basic execution with hp - protsigs = get_test_data('hp.zip') - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'manysearch', protsigs, protsigs, - '-k', '19', '-s', '100', '--moltype', 'hp', - '-o', output, '-t', '0.01') + protsigs = get_test_data("hp.zip") + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "manysearch", + protsigs, + protsigs, + "-k", + "19", + "-s", + "100", + "--moltype", + "hp", + "-o", + output, + "-t", + "0.01", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 4 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): print(row) # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert float(row['query_containment_ani']) == 1.0 - assert float(row['match_containment_ani']) == 1.0 - assert float(row['average_containment_ani']) == 1.0 - assert float(row['max_containment_ani']) == 1.0 + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"]) == 1.0 + assert float(row["match_containment_ani"]) == 1.0 + assert float(row["average_containment_ani"]) == 1.0 + assert float(row["max_containment_ani"]) == 1.0 else: - # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - query_ani = float(row['query_containment_ani']) - match_ani = float(row['match_containment_ani']) - average_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + # confirm hand-checked numbers + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + query_ani = float(row["query_containment_ani"]) + match_ani = float(row["match_containment_ani"]) + average_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -1006,9 +1130,20 @@ def test_simple_hp(runtmp): match_ani = round(match_ani, 4) average_ani = round(average_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", intersect_hashes, f"{query_ani:.04}", f"{match_ani:.04}", f"{average_ani:.04}", f"{max_ani:.04}") - - if q == 'GCA_001593925' and m == 'GCA_001593935': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + intersect_hashes, + f"{query_ani:.04}", + f"{match_ani:.04}", + f"{average_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "GCA_001593925" and m == "GCA_001593935": assert jaccard == 0.4983 assert cont == 0.747 assert maxcont == 0.747 @@ -1018,7 +1153,7 @@ def test_simple_hp(runtmp): assert average_ani == 0.993 assert max_ani == 0.9949 - if q == 'GCA_001593935' and m == 'GCA_001593925': + if q == "GCA_001593935" and m == "GCA_001593925": assert jaccard == 0.4983 assert cont == 0.5994 assert maxcont == 0.747 @@ -1031,49 +1166,63 @@ def test_simple_hp(runtmp): def test_simple_hp_indexed(runtmp): # test indexed execution with hp, indexed - protsigs = get_test_data('hp.zip') - output = runtmp.output('out.csv') - - protsigs_db = index_siglist(runtmp, protsigs, runtmp.output('db'), - ksize=19, moltype='hp', scaled=100) - - runtmp.sourmash('scripts', 'manysearch', protsigs, protsigs_db, - '-k', '19', '-s', '100', '--moltype', 'hp', - '-o', output, '-t', '0.01') + protsigs = get_test_data("hp.zip") + output = runtmp.output("out.csv") + + protsigs_db = index_siglist( + runtmp, protsigs, runtmp.output("db"), ksize=19, moltype="hp", scaled=100 + ) + + runtmp.sourmash( + "scripts", + "manysearch", + protsigs, + protsigs_db, + "-k", + "19", + "-s", + "100", + "--moltype", + "hp", + "-o", + output, + "-t", + "0.01", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 4 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): print(row) # identical? - if row['match_name'] == row['query_name']: - assert float(row['containment'] == 1.0) - assert float(row['query_containment_ani']) == 1.0 + if row["match_name"] == row["query_name"]: + assert float(row["containment"] == 1.0) + assert float(row["query_containment_ani"]) == 1.0 else: - # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - intersect_hashes = int(row['intersect_hashes']) - query_ani = float(row['query_containment_ani']) + # confirm hand-checked numbers + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + intersect_hashes = int(row["intersect_hashes"]) + query_ani = float(row["query_containment_ani"]) cont = round(cont, 4) query_ani = round(query_ani, 4) print(q, m, f"{cont:.04}", intersect_hashes, f"{query_ani:.04}") - if q == 'GCA_001593925' and m == 'GCA_001593935': + if q == "GCA_001593925" and m == "GCA_001593935": assert cont == 0.747 assert intersect_hashes == 1724 assert query_ani == 0.9949 - if q == 'GCA_001593935' and m == 'GCA_001593925': + if q == "GCA_001593935" and m == "GCA_001593925": assert cont == 0.5994 assert intersect_hashes == 1724 assert query_ani == 0.9911 @@ -1081,19 +1230,18 @@ def test_simple_hp_indexed(runtmp): def test_pretty_print(runtmp): # test pretty-printing of output - query = get_test_data('hmp-queries.sig.zip') - against = get_test_data('hmp-against.sig.zip') + query = get_test_data("hmp-queries.sig.zip") + against = get_test_data("hmp-against.sig.zip") - outcsv = runtmp.output('xxx.csv') + outcsv = runtmp.output("xxx.csv") - runtmp.sourmash('scripts', 'manysearch', query, against, - '-o', outcsv) + runtmp.sourmash("scripts", "manysearch", query, against, "-o", outcsv) print(runtmp.last_result.out) # if this fails in the future, it might be because the order of the # output gets shuffled by multithreading. consider refactoring to # do line by line? - expected="""\ + expected = """\ query p_genome avg_abund p_metag metagenome name -------- -------- --------- ------- --------------- B. fragilis I1345 96.7% 7.3 27.5% CD136 @@ -1105,17 +1253,16 @@ def test_pretty_print(runtmp): def test_no_pretty_print(runtmp): # test turning off pretty-printing of output - query = get_test_data('hmp-queries.sig.zip') - against = get_test_data('hmp-against.sig.zip') + query = get_test_data("hmp-queries.sig.zip") + against = get_test_data("hmp-against.sig.zip") - outcsv = runtmp.output('xxx.csv') + outcsv = runtmp.output("xxx.csv") - runtmp.sourmash('scripts', 'manysearch', query, against, - '-o', outcsv, '-N') + runtmp.sourmash("scripts", "manysearch", query, against, "-o", outcsv, "-N") print(runtmp.last_result.out) # if this fails in the future, it might be because the order of the # output gets shuffled by multithreading. consider refactoring to # do line by line? - expected="p_genome" + expected = "p_genome" assert expected not in runtmp.last_result.out diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index bcd445e6..ef018a96 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -5,79 +5,82 @@ import sourmash from . import sourmash_tst_utils as utils -from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist, - index_siglist) +from .sourmash_tst_utils import ( + get_test_data, + make_file_list, + zip_siglist, + index_siglist, +) def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch') + runtmp.sourmash("scripts", "multisearch") - assert 'usage: multisearch' in runtmp.last_result.err + assert "usage: multisearch" in runtmp.last_result.err def test_simple_no_ani(runtmp, zip_query, zip_db): # test basic execution! - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_db: - against_list = zip_siglist(runtmp, against_list, runtmp.output('db.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("db.zip")) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_list, against_list, "-o", output) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 5 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert 'query_containment_ani' not in row - assert 'match_containment_ani' not in row - assert 'average_containment_ani' not in row - assert 'max_containment_ani' not in row + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert "query_containment_ani" not in row + assert "match_containment_ani" not in row + assert "average_containment_ani" not in row + assert "max_containment_ani" not in row else: # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) jaccard = round(jaccard, 4) cont = round(cont, 4) maxcont = round(maxcont, 4) print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}") - if q == 'NC_011665.1' and m == 'NC_009661.1': + if q == "NC_011665.1" and m == "NC_009661.1": assert jaccard == 0.3207 assert cont == 0.4828 assert maxcont == 0.4885 assert intersect_hashes == 2529 - if q == 'NC_009661.1' and m == 'NC_011665.1': + if q == "NC_009661.1" and m == "NC_011665.1": assert jaccard == 0.3207 assert cont == 0.4885 assert maxcont == 0.4885 @@ -86,65 +89,65 @@ def test_simple_no_ani(runtmp, zip_query, zip_db): def test_simple_ani(runtmp, zip_query, zip_db, indexed_query, indexed_against): # test basic execution! - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_db: - against_list = zip_siglist(runtmp, against_list, runtmp.output('db.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("db.zip")) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) if indexed_query: - query_list = index_siglist(runtmp, query_list, runtmp.output('q_db')) + query_list = index_siglist(runtmp, query_list, runtmp.output("q_db")) if indexed_against: - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + against_list = index_siglist(runtmp, against_list, runtmp.output("db")) - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output, '--ani') + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output, "--ani" + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 5 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) - assert float(row['match_containment_ani'] == 1.0) - assert float(row['average_containment_ani'] == 1.0) - assert float(row['max_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) + assert float(row["match_containment_ani"] == 1.0) + assert float(row["average_containment_ani"] == 1.0) + assert float(row["max_containment_ani"] == 1.0) else: # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - q1_ani = float(row['query_containment_ani']) - q2_ani = float(row['match_containment_ani']) - avg_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) - + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + q1_ani = float(row["query_containment_ani"]) + q2_ani = float(row["match_containment_ani"]) + avg_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -153,9 +156,19 @@ def test_simple_ani(runtmp, zip_query, zip_db, indexed_query, indexed_against): q2_ani = round(q2_ani, 4) avg_ani = round(avg_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", f"{q1_ani:.04}", f"{q2_ani:.04}", f"{avg_ani:.04}", f"{max_ani:.04}") - - if q == 'NC_011665.1' and m == 'NC_009661.1': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + f"{q1_ani:.04}", + f"{q2_ani:.04}", + f"{avg_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "NC_011665.1" and m == "NC_009661.1": assert jaccard == 0.3207 assert cont == 0.4828 assert maxcont == 0.4885 @@ -165,7 +178,7 @@ def test_simple_ani(runtmp, zip_query, zip_db, indexed_query, indexed_against): assert avg_ani == 0.977 assert max_ani == 0.9772 - if q == 'NC_009661.1' and m == 'NC_011665.1': + if q == "NC_009661.1" and m == "NC_011665.1": assert jaccard == 0.3207 assert cont == 0.4885 assert maxcont == 0.4885 @@ -178,53 +191,53 @@ def test_simple_ani(runtmp, zip_query, zip_db, indexed_query, indexed_against): def test_simple_ani_list_of_zips(runtmp): # test basic execution against a pathlist file of zips - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.sig.zip') - sig47 = get_test_data('47.sig.zip') - sig63 = get_test_data('63.sig.zip') + sig2 = get_test_data("2.sig.zip") + sig47 = get_test_data("47.sig.zip") + sig63 = get_test_data("63.sig.zip") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output, '--ani') + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output, "--ani" + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 5 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) - assert float(row['match_containment_ani'] == 1.0) - assert float(row['average_containment_ani'] == 1.0) - assert float(row['max_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) + assert float(row["match_containment_ani"] == 1.0) + assert float(row["average_containment_ani"] == 1.0) + assert float(row["max_containment_ani"] == 1.0) else: # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - q1_ani = float(row['query_containment_ani']) - q2_ani = float(row['match_containment_ani']) - avg_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) - + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + q1_ani = float(row["query_containment_ani"]) + q2_ani = float(row["match_containment_ani"]) + avg_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -233,9 +246,19 @@ def test_simple_ani_list_of_zips(runtmp): q2_ani = round(q2_ani, 4) avg_ani = round(avg_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", f"{q1_ani:.04}", f"{q2_ani:.04}", f"{avg_ani:.04}", f"{max_ani:.04}") - - if q == 'NC_011665.1' and m == 'NC_009661.1': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + f"{q1_ani:.04}", + f"{q2_ani:.04}", + f"{avg_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "NC_011665.1" and m == "NC_009661.1": assert jaccard == 0.3207 assert cont == 0.4828 assert maxcont == 0.4885 @@ -245,7 +268,7 @@ def test_simple_ani_list_of_zips(runtmp): assert avg_ani == 0.977 assert max_ani == 0.9772 - if q == 'NC_009661.1' and m == 'NC_011665.1': + if q == "NC_009661.1" and m == "NC_011665.1": assert jaccard == 0.3207 assert cont == 0.4885 assert maxcont == 0.4885 @@ -258,101 +281,110 @@ def test_simple_ani_list_of_zips(runtmp): def test_simple_ani_list_of_csv(runtmp): # test basic execution against a pathlist file of manifests - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.sig.zip') - sig47 = get_test_data('47.sig.zip') - sig63 = get_test_data('63.sig.zip') + sig2 = get_test_data("2.sig.zip") + sig47 = get_test_data("47.sig.zip") + sig63 = get_test_data("63.sig.zip") - runtmp.sourmash('sig', 'collect', sig2, '-o', 'sig2.mf.csv', '-F', 'csv') - runtmp.sourmash('sig', 'collect', sig47, '-o', 'sig47.mf.csv', '-F', 'csv') - runtmp.sourmash('sig', 'collect', sig63, '-o', 'sig63.mf.csv', '-F', 'csv') + runtmp.sourmash("sig", "collect", sig2, "-o", "sig2.mf.csv", "-F", "csv") + runtmp.sourmash("sig", "collect", sig47, "-o", "sig47.mf.csv", "-F", "csv") + runtmp.sourmash("sig", "collect", sig63, "-o", "sig63.mf.csv", "-F", "csv") - make_file_list(query_list, ['sig2.mf.csv', 'sig47.mf.csv', 'sig63.mf.csv']) - make_file_list(against_list, ['sig2.mf.csv', 'sig47.mf.csv', 'sig63.mf.csv']) + make_file_list(query_list, ["sig2.mf.csv", "sig47.mf.csv", "sig63.mf.csv"]) + make_file_list(against_list, ["sig2.mf.csv", "sig47.mf.csv", "sig63.mf.csv"]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output, '--ani') + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output, "--ani" + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 5 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) def test_simple_ani_standalone_manifest(runtmp): # test basic execution of a standalone manifest - against_list = runtmp.output('against.sig.zip') + against_list = runtmp.output("against.sig.zip") - sig2 = get_test_data('2.sig.zip') - sig47 = get_test_data('47.sig.zip') - sig63 = get_test_data('63.sig.zip') + sig2 = get_test_data("2.sig.zip") + sig47 = get_test_data("47.sig.zip") + sig63 = get_test_data("63.sig.zip") - runtmp.sourmash('sig', 'cat', sig2, sig47, sig63, '-o', against_list) + runtmp.sourmash("sig", "cat", sig2, sig47, sig63, "-o", against_list) - picklist_file = runtmp.output('pl.csv') - with open(picklist_file, 'w', newline='') as fp: + picklist_file = runtmp.output("pl.csv") + with open(picklist_file, "w", newline="") as fp: w = csv.writer(fp) - w.writerow(['ident']) - w.writerow(['CP001071.1']) + w.writerow(["ident"]) + w.writerow(["CP001071.1"]) # use picklist to create a standalone manifest - query_csv = runtmp.output('select.mf.csv') - runtmp.sourmash('sig', 'check', '--picklist', - f'{picklist_file}:ident:ident', - '-m', query_csv, against_list) - - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'multisearch', query_csv, against_list, - '-o', output, '--ani') + query_csv = runtmp.output("select.mf.csv") + runtmp.sourmash( + "sig", + "check", + "--picklist", + f"{picklist_file}:ident:ident", + "-m", + query_csv, + against_list, + ) + + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", "multisearch", query_csv, against_list, "-o", output, "--ani" + ) assert os.path.exists(output) df = pandas.read_csv(output) - assert len(df) == 1 # should only be the one, identical match. + assert len(df) == 1 # should only be the one, identical match. - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) - assert float(row['match_containment_ani'] == 1.0) - assert float(row['average_containment_ani'] == 1.0) - assert float(row['max_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) + assert float(row["match_containment_ani"] == 1.0) + assert float(row["average_containment_ani"] == 1.0) + assert float(row["max_containment_ani"] == 1.0) def test_simple_threshold(runtmp, zip_query, zip_db): # test with a simple threshold => only 3 results - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_db: - against_list = zip_siglist(runtmp, against_list, runtmp.output('db.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("db.zip")) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output, '-t', '0.5') + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output, "-t", "0.5" + ) assert os.path.exists(output) df = pandas.read_csv(output) @@ -361,26 +393,27 @@ def test_simple_threshold(runtmp, zip_query, zip_db): def test_simple_manifest(runtmp): # test with a simple threshold => only 3 results - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) - query_mf = runtmp.output('qmf.csv') - against_mf = runtmp.output('amf.csv') + query_mf = runtmp.output("qmf.csv") + against_mf = runtmp.output("amf.csv") runtmp.sourmash("sig", "manifest", query_list, "-o", query_mf) runtmp.sourmash("sig", "manifest", against_list, "-o", against_mf) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', query_mf, against_mf, - '-o', output, '-t', '0.5') + runtmp.sourmash( + "scripts", "multisearch", query_mf, against_mf, "-o", output, "-t", "0.5" + ) assert os.path.exists(output) df = pandas.read_csv(output) @@ -389,33 +422,34 @@ def test_simple_manifest(runtmp): def test_lists_of_standalone_manifests(runtmp, capfd): # test pathlists of manifests - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - sig2_mf = runtmp.output('2.mf.csv') - runtmp.sourmash('sig', 'collect', sig2, '-o', sig2_mf, '-F', 'csv') - sig47_mf = runtmp.output('47.mf.csv') - runtmp.sourmash('sig', 'collect', sig47, '-o', sig47_mf, '-F', 'csv') - sig63_mf = runtmp.output('63.mf.csv') - runtmp.sourmash('sig', 'collect', sig63, '-o', sig63_mf, '-F', 'csv') + sig2_mf = runtmp.output("2.mf.csv") + runtmp.sourmash("sig", "collect", sig2, "-o", sig2_mf, "-F", "csv") + sig47_mf = runtmp.output("47.mf.csv") + runtmp.sourmash("sig", "collect", sig47, "-o", sig47_mf, "-F", "csv") + sig63_mf = runtmp.output("63.mf.csv") + runtmp.sourmash("sig", "collect", sig63, "-o", sig63_mf, "-F", "csv") make_file_list(query_list, [sig2_mf, sig47_mf, sig63_mf]) make_file_list(against_list, [sig2, sig47, sig63]) - query_mf = runtmp.output('qmf.csv') - against_mf = runtmp.output('amf.csv') + query_mf = runtmp.output("qmf.csv") + against_mf = runtmp.output("amf.csv") runtmp.sourmash("sig", "manifest", query_list, "-o", query_mf) runtmp.sourmash("sig", "manifest", against_list, "-o", against_mf) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', query_mf, against_mf, - '-o', output, '-t', '0.5') + runtmp.sourmash( + "scripts", "multisearch", query_mf, against_mf, "-o", output, "-t", "0.5" + ) assert os.path.exists(output) df = pandas.read_csv(output) @@ -427,44 +461,44 @@ def test_lists_of_standalone_manifests(runtmp, capfd): def test_missing_query(runtmp, capfd, zip_query): # test with a missing query list - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = runtmp.output('query.zip') + query_list = runtmp.output("query.zip") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output + ) captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory' in captured.err + assert "Error: No such file or directory" in captured.err def test_sig_query(runtmp, capfd): # sig is ok as query now - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', sig2, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", sig2, against_list, "-o", output) captured = capfd.readouterr() print(captured.err) @@ -476,98 +510,99 @@ def test_sig_query(runtmp, capfd): def test_bad_query(runtmp, capfd): # test with a bad query list (a missing file) - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, "no-exist"]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_list, against_list, "-o", output) captured = capfd.readouterr() print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err + assert ( + "WARNING: 1 query paths failed to load. See error messages above." + in captured.err + ) def test_bad_query_3(runtmp, capfd): # test with a bad query (a .sig.gz file renamed as zip file) - against_list = runtmp.output('against.txt') + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_zip = runtmp.output('query.zip') + query_zip = runtmp.output("query.zip") # cp sig2 into query_zip - with open(query_zip, 'wb') as fp: - with open(sig2, 'rb') as fp2: + with open(query_zip, "wb") as fp: + with open(sig2, "rb") as fp2: fp.write(fp2.read()) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_zip, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_zip, against_list, "-o", output) captured = capfd.readouterr() print(captured.err) - assert 'InvalidArchive' in captured.err + assert "InvalidArchive" in captured.err def test_missing_against(runtmp, capfd, zip_db): # test with a missing against list - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) # do not create against_list if zip_db: - #specify .zip but don't create the file - against_list = runtmp.output('db.zip') + # specify .zip but don't create the file + against_list = runtmp.output("db.zip") - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output + ) captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory' in captured.err + assert "Error: No such file or directory" in captured.err def test_sig_against(runtmp, capfd): # against can be sig now - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', query_list, sig2, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_list, sig2, "-o", output) captured = capfd.readouterr() print(captured.err) @@ -579,44 +614,47 @@ def test_sig_against(runtmp, capfd): def test_bad_against(runtmp, capfd): # test with a bad against list (a missing file) - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, "no-exist"]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_list, against_list, "-o", output) captured = capfd.readouterr() print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err + assert ( + "WARNING: 1 search paths failed to load. See error messages above." + in captured.err + ) def test_empty_query(runtmp, capfd): # test with an empty query list - fail with error - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, []) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output + ) print(runtmp.last_result.err) captured = capfd.readouterr() @@ -626,186 +664,184 @@ def test_empty_query(runtmp, capfd): def test_nomatch_query_warn(runtmp, capfd, zip_query): # test a non-matching (diff ksize) in query; do we get warning message? - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig1 = get_test_data('1.fa.k21.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig1 = get_test_data("1.fa.k21.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63, sig1]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_list, against_list, "-o", output) assert os.path.exists(output) captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 query paths - no compatible signatures' in captured.err + assert "WARNING: skipped 1 query paths - no compatible signatures" in captured.err def test_nomatch_query_exit(runtmp, capfd, zip_query): # test loading no matching sketches - do we error exit appropriately? - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig1 = get_test_data('1.fa.k21.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig1 = get_test_data("1.fa.k21.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig1]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output + ) captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 query paths - no compatible signatures' in captured.err - assert 'No query signatures loaded, exiting' in captured.err + assert "WARNING: skipped 1 query paths - no compatible signatures" in captured.err + assert "No query signatures loaded, exiting" in captured.err def test_nomatch_against(runtmp, capfd, zip_query): # test a non-matching (diff ksize) in against; do we get warning message? - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig1 = get_test_data('1.fa.k21.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig1 = get_test_data("1.fa.k21.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63, sig1]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output, '-k', '21') + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output, "-k", "21" + ) captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 3 search paths - no compatible signatures' in captured.err - assert 'No search signatures loaded, exiting' in captured.err + assert "WARNING: skipped 3 search paths - no compatible signatures" in captured.err + assert "No search signatures loaded, exiting" in captured.err def test_load_only_one_bug(runtmp, capfd, zip_db): # check that we behave properly when presented with multiple against # sketches - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig1_k31 = get_test_data('1.fa.k31.sig.gz') + sig1_k31 = get_test_data("1.fa.k31.sig.gz") # note: this was created as a 3-sketch-in-one-signature directly # via sourmash sketch dna -p k=21,k=31,k=51. - sig1_all = get_test_data('1.combined.sig.gz') + sig1_all = get_test_data("1.combined.sig.gz") make_file_list(query_list, [sig1_k31]) make_file_list(against_list, [sig1_all]) if zip_db: - against_list = zip_siglist(runtmp, against_list, runtmp.output('db.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("db.zip")) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_list, against_list, "-o", output) assert os.path.exists(output) captured = capfd.readouterr() print(captured.err) - assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err - assert not 'WARNING: no compatible sketches in path' in captured.err + assert not "WARNING: skipped 1 paths - no compatible signatures." in captured.err + assert not "WARNING: no compatible sketches in path" in captured.err def test_load_only_one_bug_as_query(runtmp, capfd, zip_query): # check that we behave properly when presented with multiple query # sketches in one file, with only one matching. - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig1_k31 = get_test_data('1.fa.k31.sig.gz') + sig1_k31 = get_test_data("1.fa.k31.sig.gz") # note: this was created as a 3-sketch-in-one-signature directly # via sourmash sketch dna -p k=21,k=31,k=51. - sig1_all = get_test_data('1.combined.sig.gz') + sig1_all = get_test_data("1.combined.sig.gz") make_file_list(query_list, [sig1_all]) make_file_list(against_list, [sig1_k31]) if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_list, against_list, "-o", output) assert os.path.exists(output) captured = capfd.readouterr() print(captured.err) - assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err - assert not 'WARNING: no compatible sketches in path ' in captured.err + assert not "WARNING: skipped 1 paths - no compatible signatures." in captured.err + assert not "WARNING: no compatible sketches in path " in captured.err def test_md5(runtmp, zip_query, zip_db): # test that md5s match what was in the original files, not downsampled etc. - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) if zip_db: - against_list = zip_siglist(runtmp, against_list, runtmp.output('db.zip')) + against_list = zip_siglist(runtmp, against_list, runtmp.output("db.zip")) - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_list, against_list, "-o", output) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 5 - md5s = list(df['query_md5']) + md5s = list(df["query_md5"]) print(md5s) for query_file in (sig2, sig47, sig63): for ss in sourmash.load_file_as_signatures(query_file, ksize=31): assert ss.md5sum() in md5s - md5s = list(df['match_md5']) + md5s = list(df["match_md5"]) print(md5s) for against_file in (sig2, sig47, sig63): @@ -815,45 +851,57 @@ def test_md5(runtmp, zip_query, zip_db): def test_simple_prot(runtmp): # test basic execution with protein sigs - sigs = get_test_data('protein.zip') - - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'multisearch', sigs, sigs, - '-o', output, '--moltype', 'protein', - '-k', '19', '--scaled', '100', '--ani') + sigs = get_test_data("protein.zip") + + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "multisearch", + sigs, + sigs, + "-o", + output, + "--moltype", + "protein", + "-k", + "19", + "--scaled", + "100", + "--ani", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 4 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) - assert float(row['match_containment_ani'] == 1.0) - assert float(row['average_containment_ani'] == 1.0) - assert float(row['max_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) + assert float(row["match_containment_ani"] == 1.0) + assert float(row["average_containment_ani"] == 1.0) + assert float(row["max_containment_ani"] == 1.0) else: # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - q1_ani = float(row['query_containment_ani']) - q2_ani = float(row['match_containment_ani']) - avg_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + q1_ani = float(row["query_containment_ani"]) + q2_ani = float(row["match_containment_ani"]) + avg_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -862,9 +910,20 @@ def test_simple_prot(runtmp): q2_ani = round(q2_ani, 4) avg_ani = round(avg_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", intersect_hashes, f"{q1_ani:.04}", f"{q2_ani:.04}", f"{avg_ani:.04}", f"{max_ani:.04}") - - if q == 'GCA_001593925' and m == 'GCA_001593935': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + intersect_hashes, + f"{q1_ani:.04}", + f"{q2_ani:.04}", + f"{avg_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "GCA_001593925" and m == "GCA_001593935": assert jaccard == 0.0434 assert cont == 0.1003 assert maxcont == 0.1003 @@ -874,7 +933,7 @@ def test_simple_prot(runtmp): assert avg_ani == 0.8781 assert max_ani == 0.886 - if q == 'GCA_001593935' and m == 'GCA_001593925': + if q == "GCA_001593935" and m == "GCA_001593925": assert jaccard == 0.0434 assert cont == 0.0712 assert maxcont == 0.1003 @@ -887,45 +946,57 @@ def test_simple_prot(runtmp): def test_simple_dayhoff(runtmp): # test basic execution with dayhoff sigs - sigs = get_test_data('dayhoff.zip') - - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'multisearch', sigs, sigs, - '-o', output, '--moltype', 'dayhoff', - '-k', '19', '--scaled', '100', '--ani') + sigs = get_test_data("dayhoff.zip") + + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "multisearch", + sigs, + sigs, + "-o", + output, + "--moltype", + "dayhoff", + "-k", + "19", + "--scaled", + "100", + "--ani", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 4 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) - assert float(row['match_containment_ani'] == 1.0) - assert float(row['average_containment_ani'] == 1.0) - assert float(row['max_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) + assert float(row["match_containment_ani"] == 1.0) + assert float(row["average_containment_ani"] == 1.0) + assert float(row["max_containment_ani"] == 1.0) else: # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - q1_ani = float(row['query_containment_ani']) - q2_ani = float(row['match_containment_ani']) - avg_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + q1_ani = float(row["query_containment_ani"]) + q2_ani = float(row["match_containment_ani"]) + avg_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -934,9 +1005,20 @@ def test_simple_dayhoff(runtmp): q2_ani = round(q2_ani, 4) avg_ani = round(avg_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", intersect_hashes, f"{q1_ani:.04}", f"{q2_ani:.04}", f"{avg_ani:.04}", f"{max_ani:.04}") - - if q == 'GCA_001593925' and m == 'GCA_001593935': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + intersect_hashes, + f"{q1_ani:.04}", + f"{q2_ani:.04}", + f"{avg_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "GCA_001593925" and m == "GCA_001593935": assert jaccard == 0.1326 assert cont == 0.2815 assert maxcont == 0.2815 @@ -946,7 +1028,7 @@ def test_simple_dayhoff(runtmp): assert avg_ani == 0.9272 assert max_ani == 0.9355 - if q == 'GCA_001593935' and m == 'GCA_001593925': + if q == "GCA_001593935" and m == "GCA_001593925": assert jaccard == 0.1326 assert cont == 0.2004 assert maxcont == 0.2815 @@ -959,45 +1041,57 @@ def test_simple_dayhoff(runtmp): def test_simple_hp(runtmp): # test basic execution with hp sigs - sigs = get_test_data('hp.zip') - - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'multisearch', sigs, sigs, - '-o', output, '--moltype', 'hp', - '-k', '19', '--scaled', '100', '--ani') + sigs = get_test_data("hp.zip") + + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "multisearch", + sigs, + sigs, + "-o", + output, + "--moltype", + "hp", + "-k", + "19", + "--scaled", + "100", + "--ani", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 4 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # identical? - if row['match_name'] == row['query_name']: - assert row['query_md5'] == row['match_md5'], row - assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) - assert float(row['max_containment'] == 1.0) - assert float(row['query_containment_ani'] == 1.0) - assert float(row['match_containment_ani'] == 1.0) - assert float(row['average_containment_ani'] == 1.0) - assert float(row['max_containment_ani'] == 1.0) + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) + assert float(row["match_containment_ani"] == 1.0) + assert float(row["average_containment_ani"] == 1.0) + assert float(row["max_containment_ani"] == 1.0) else: # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - q1_ani = float(row['query_containment_ani']) - q2_ani = float(row['match_containment_ani']) - avg_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + q1_ani = float(row["query_containment_ani"]) + q2_ani = float(row["match_containment_ani"]) + avg_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -1006,9 +1100,20 @@ def test_simple_hp(runtmp): q2_ani = round(q2_ani, 4) avg_ani = round(avg_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", intersect_hashes, f"{q1_ani:.04}", f"{q2_ani:.04}", f"{avg_ani:.04}", f"{max_ani:.04}") - - if q == 'GCA_001593925' and m == 'GCA_001593935': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + intersect_hashes, + f"{q1_ani:.04}", + f"{q2_ani:.04}", + f"{avg_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "GCA_001593925" and m == "GCA_001593935": assert jaccard == 0.4983 assert cont == 0.747 assert maxcont == 0.747 @@ -1018,7 +1123,7 @@ def test_simple_hp(runtmp): assert avg_ani == 0.9791 assert max_ani == 0.9848 - if q == 'GCA_001593935' and m == 'GCA_001593925': + if q == "GCA_001593935" and m == "GCA_001593925": assert jaccard == 0.4983 assert cont == 0.5994 assert maxcont == 0.747 @@ -1031,79 +1136,88 @@ def test_simple_hp(runtmp): def test_simple_below_threshold(runtmp): # test basic execution! - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output, '--ani', '--threshold', '0.5') + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "multisearch", + query_list, + against_list, + "-o", + output, + "--ani", + "--threshold", + "0.5", + ) assert os.path.exists(output) - with open(output, 'r') as csvfile: + with open(output, "r") as csvfile: reader = csv.DictReader(csvfile) rows = list(reader) assert len(rows) == 3 for row in rows: # only identical reported print(row) - assert row['query_md5'] == row['match_md5'] - assert float(row['containment']) == 1.0 - assert float(row['jaccard']) == 1.0 - assert float(row['max_containment']) == 1.0 - assert float(row['query_containment_ani']) == 1.0 - assert float(row['match_containment_ani']) == 1.0 - assert float(row['average_containment_ani']) == 1.0 - assert float(row['max_containment_ani']) == 1.0 + assert row["query_md5"] == row["match_md5"] + assert float(row["containment"]) == 1.0 + assert float(row["jaccard"]) == 1.0 + assert float(row["max_containment"]) == 1.0 + assert float(row["query_containment_ani"]) == 1.0 + assert float(row["match_containment_ani"]) == 1.0 + assert float(row["average_containment_ani"]) == 1.0 + assert float(row["max_containment_ani"]) == 1.0 def test_mismatched_scaled_query(runtmp): # test what happens if query scaled is too high - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_list = runtmp.output('downsample.sig.zip') - runtmp.sourmash('sig', 'downsample', - '--scaled=10_000', sig2, sig47, sig63, - '-o', query_list) + query_list = runtmp.output("downsample.sig.zip") + runtmp.sourmash( + "sig", "downsample", "--scaled=10_000", sig2, sig47, sig63, "-o", query_list + ) make_file_list(against_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash("scripts", "multisearch", query_list, against_list, "-o", output) assert os.path.exists(output) def test_mismatched_scaled_against(runtmp): # test what happens if against scaled is too high - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - against_list = runtmp.output('downsample.sig.zip') - runtmp.sourmash('sig', 'downsample', - '--scaled=10_000', sig2, sig47, sig63, - '-o', against_list) + against_list = runtmp.output("downsample.sig.zip") + runtmp.sourmash( + "sig", "downsample", "--scaled=10_000", sig2, sig47, sig63, "-o", against_list + ) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, - '-o', output) + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output + ) diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index cba2a297..bd54c5cd 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -5,64 +5,67 @@ import sourmash from . import sourmash_tst_utils as utils -from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist, - index_siglist) +from .sourmash_tst_utils import ( + get_test_data, + make_file_list, + zip_siglist, + index_siglist, +) def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'pairwise') + runtmp.sourmash("scripts", "pairwise") - assert 'usage: pairwise' in runtmp.last_result.err + assert "usage: pairwise" in runtmp.last_result.err def test_simple_no_ani(runtmp, capfd, zip_query, indexed): # test basic execution! - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) if indexed: - query_list = index_siglist(runtmp, query_list, runtmp.output('db')) + query_list = index_siglist(runtmp, query_list, runtmp.output("db")) - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output, '-t', '-1') + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output, "-t", "-1") assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 3 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - assert 'query_containment_ani' not in row - assert 'match_containment_ani' not in row - assert 'average_containment_ani' not in row - assert 'max_containment_ani' not in row + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + assert "query_containment_ani" not in row + assert "match_containment_ani" not in row + assert "average_containment_ani" not in row + assert "max_containment_ani" not in row jaccard = round(jaccard, 4) cont = round(cont, 4) maxcont = round(maxcont, 4) print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}") - if q == 'NC_011665.1' and m == 'NC_009661.1': + if q == "NC_011665.1" and m == "NC_009661.1": assert jaccard == 0.3207 assert cont == 0.4828 assert maxcont == 0.4885 @@ -72,46 +75,49 @@ def test_simple_no_ani(runtmp, capfd, zip_query, indexed): print(captured.err) if indexed: - assert "WARNING: loading all sketches from a RocksDB into memory!" in captured.err + assert ( + "WARNING: loading all sketches from a RocksDB into memory!" in captured.err + ) def test_simple_ani(runtmp, zip_query): # test basic execution! - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output, '-t', '-1', '--ani') + runtmp.sourmash( + "scripts", "pairwise", query_list, "-o", output, "-t", "-1", "--ani" + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 3 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - q1_ani = float(row['query_containment_ani']) - q2_ani = float(row['match_containment_ani']) - avg_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + q1_ani = float(row["query_containment_ani"]) + q2_ani = float(row["match_containment_ani"]) + avg_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -120,9 +126,19 @@ def test_simple_ani(runtmp, zip_query): q2_ani = round(q2_ani, 4) avg_ani = round(avg_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", f"{q1_ani:.04}", f"{q2_ani:.04}", f"{avg_ani:.04}", f"{max_ani:.04}") - - if q == 'NC_011665.1' and m == 'NC_009661.1': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + f"{q1_ani:.04}", + f"{q2_ani:.04}", + f"{avg_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "NC_011665.1" and m == "NC_009661.1": assert jaccard == 0.3207 assert cont == 0.4828 assert maxcont == 0.4885 @@ -135,22 +151,20 @@ def test_simple_ani(runtmp, zip_query): def test_simple_threshold(runtmp, zip_query): # test with a simple threshold => only 3 results - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') - + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output, '-t', '0.1') + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output, "-t", "0.1") assert os.path.exists(output) df = pandas.read_csv(output) @@ -159,22 +173,21 @@ def test_simple_threshold(runtmp, zip_query): def test_simple_manifest(runtmp): # test with a simple threshold => only 3 results - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - query_mf = runtmp.output('qmf.csv') + query_mf = runtmp.output("qmf.csv") runtmp.sourmash("sig", "manifest", query_list, "-o", query_mf) - runtmp.sourmash('scripts', 'pairwise', query_mf, - '-o', output, '-t', '0.1') + runtmp.sourmash("scripts", "pairwise", query_mf, "-o", output, "-t", "0.1") assert os.path.exists(output) df = pandas.read_csv(output) @@ -183,13 +196,12 @@ def test_simple_manifest(runtmp): def test_sig_query(runtmp, capfd): # sig query is ok now, but fails bc only one sig - sig2 = get_test_data('2.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'pairwise', sig2, - '-o', output) + runtmp.sourmash("scripts", "pairwise", sig2, "-o", output) captured = capfd.readouterr() print(captured.err) @@ -198,232 +210,240 @@ def test_sig_query(runtmp, capfd): def test_bad_query(runtmp, capfd): # test with a bad query list (a missing file) - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") make_file_list(query_list, [sig2, sig47, "no-exist"]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output) + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output) captured = capfd.readouterr() print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 analysis paths failed to load. See error messages above." in captured.err + assert ( + "WARNING: 1 analysis paths failed to load. See error messages above." + in captured.err + ) def test_bad_query_2(runtmp, capfd): # test with a bad query (a .sig.gz file renamed as zip file) - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - query_zip = runtmp.output('query.zip') + query_zip = runtmp.output("query.zip") # cp sig2 into query_zip - with open(query_zip, 'wb') as fp: - with open(sig2, 'rb') as fp2: + with open(query_zip, "wb") as fp: + with open(sig2, "rb") as fp2: fp.write(fp2.read()) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'pairwise', query_zip, - '-o', output) + runtmp.sourmash("scripts", "pairwise", query_zip, "-o", output) captured = capfd.readouterr() print(captured.err) - assert 'InvalidArchive' in captured.err + assert "InvalidArchive" in captured.err def test_missing_query(runtmp, capfd, zip_db): # test with a missing query list - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output) - + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output) + captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory' in captured.err - + assert "Error: No such file or directory" in captured.err def test_empty_query(runtmp, capfd): # test with an empty query list - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, []) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output) + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output) captured = capfd.readouterr() - assert 'Error: No analysis signatures loaded, exiting.' in captured.err + assert "Error: No analysis signatures loaded, exiting." in captured.err def test_nomatch_query_warn(runtmp, capfd, zip_query): # test a non-matching (diff ksize) in query; do we get warning message? - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig1 = get_test_data('1.fa.k21.sig.gz') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig1 = get_test_data("1.fa.k21.sig.gz") + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63, sig1]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output) + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output) assert os.path.exists(output) captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 analysis paths - no compatible signatures' in captured.err + assert ( + "WARNING: skipped 1 analysis paths - no compatible signatures" in captured.err + ) def test_nomatch_query_exit(runtmp, capfd, zip_query): # test a non-matching (diff ksize) in query; do we get warning message? - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig1 = get_test_data('1.fa.k21.sig.gz') - sig2 = get_test_data('2.fa.k21.sig.gz') + sig1 = get_test_data("1.fa.k21.sig.gz") + sig2 = get_test_data("2.fa.k21.sig.gz") make_file_list(query_list, [sig1, sig2]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output) + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output) captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 2 analysis paths - no compatible signatures' in captured.err - assert 'Error: No analysis signatures loaded, exiting.' in captured.err + assert ( + "WARNING: skipped 2 analysis paths - no compatible signatures" in captured.err + ) + assert "Error: No analysis signatures loaded, exiting." in captured.err def test_load_only_one_bug(runtmp, capfd, zip_db): # check that we behave properly when presented with multiple query # sketches - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig1_k31 = get_test_data('1.fa.k31.sig.gz') + sig1_k31 = get_test_data("1.fa.k31.sig.gz") # note: this was created as a 3-sketch-in-one-signature directly # via sourmash sketch dna -p k=21,k=31,k=51. - sig1_all = get_test_data('1.combined.sig.gz') + sig1_all = get_test_data("1.combined.sig.gz") make_file_list(query_list, [sig1_all, sig1_k31]) if zip_db: - query_list = zip_siglist(runtmp, query_list, runtmp.output('db.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("db.zip")) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output) + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output) assert os.path.exists(output) captured = capfd.readouterr() print(captured.err) - assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err - assert not 'WARNING: no compatible sketches in path ' in captured.err + assert not "WARNING: skipped 1 paths - no compatible signatures." in captured.err + assert not "WARNING: no compatible sketches in path " in captured.err def test_md5(runtmp, zip_query): # test that md5s match what was in the original files, not downsampled etc. - query_list = runtmp.output('query.txt') + query_list = runtmp.output("query.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output, "-t", "-0.1") + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output, "-t", "-0.1") assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 3 - md5s = list(df['query_md5']) + list(df['match_md5']) + md5s = list(df["query_md5"]) + list(df["match_md5"]) print(f"md5s: {md5s}") for query_file in (sig2, sig47, sig63): for ss in sourmash.load_file_as_signatures(query_file, ksize=31): assert ss.md5sum() in md5s - md5s = list(df['match_md5']) + md5s = list(df["match_md5"]) print(md5s) def test_simple_prot_ani(runtmp): # test basic execution with protein sigs - sigs = get_test_data('protein.zip') - - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'pairwise', sigs, - '-o', output, '--moltype', 'protein', - '-k', '19', '--scaled', '100', '--ani') + sigs = get_test_data("protein.zip") + + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "pairwise", + sigs, + "-o", + output, + "--moltype", + "protein", + "-k", + "19", + "--scaled", + "100", + "--ani", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 1 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - q1_ani = float(row['query_containment_ani']) - q2_ani = float(row['match_containment_ani']) - avg_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + q1_ani = float(row["query_containment_ani"]) + q2_ani = float(row["match_containment_ani"]) + avg_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -432,9 +452,20 @@ def test_simple_prot_ani(runtmp): q2_ani = round(q2_ani, 4) avg_ani = round(avg_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", intersect_hashes, f"{q1_ani:.04}", f"{q2_ani:.04}", f"{avg_ani:.04}", f"{max_ani:.04}") - - if q == 'GCA_001593925' and m == 'GCA_001593935': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + intersect_hashes, + f"{q1_ani:.04}", + f"{q2_ani:.04}", + f"{avg_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "GCA_001593925" and m == "GCA_001593935": assert jaccard == 0.0434 assert cont == 0.1003 assert maxcont == 0.1003 @@ -447,33 +478,44 @@ def test_simple_prot_ani(runtmp): def test_simple_dayhoff_ani(runtmp): # test basic execution with dayhoff sigs - sigs = get_test_data('dayhoff.zip') - - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'pairwise', sigs, - '-o', output, '--moltype', 'dayhoff', - '-k', '19', '--scaled', '100', '--ani') + sigs = get_test_data("dayhoff.zip") + + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "pairwise", + sigs, + "-o", + output, + "--moltype", + "dayhoff", + "-k", + "19", + "--scaled", + "100", + "--ani", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 1 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - q1_ani = float(row['query_containment_ani']) - q2_ani = float(row['match_containment_ani']) - avg_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + q1_ani = float(row["query_containment_ani"]) + q2_ani = float(row["match_containment_ani"]) + avg_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -482,9 +524,20 @@ def test_simple_dayhoff_ani(runtmp): q2_ani = round(q2_ani, 4) avg_ani = round(avg_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", intersect_hashes, f"{q1_ani:.04}", f"{q2_ani:.04}", f"{avg_ani:.04}", f"{max_ani:.04}") - - if q == 'GCA_001593925' and m == 'GCA_001593935': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + intersect_hashes, + f"{q1_ani:.04}", + f"{q2_ani:.04}", + f"{avg_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "GCA_001593925" and m == "GCA_001593935": assert jaccard == 0.1326 assert cont == 0.2815 assert maxcont == 0.2815 @@ -497,33 +550,44 @@ def test_simple_dayhoff_ani(runtmp): def test_simple_hp_ani(runtmp): # test basic execution with hp sigs - sigs = get_test_data('hp.zip') - - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'pairwise', sigs, - '-o', output, '--moltype', 'hp', - '-k', '19', '--scaled', '100', '--ani') + sigs = get_test_data("hp.zip") + + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "pairwise", + sigs, + "-o", + output, + "--moltype", + "hp", + "-k", + "19", + "--scaled", + "100", + "--ani", + ) assert os.path.exists(output) df = pandas.read_csv(output) assert len(df) == 1 - dd = df.to_dict(orient='index') + dd = df.to_dict(orient="index") print(dd) for idx, row in dd.items(): # confirm hand-checked numbers - q = row['query_name'].split()[0] - m = row['match_name'].split()[0] - cont = float(row['containment']) - jaccard = float(row['jaccard']) - maxcont = float(row['max_containment']) - intersect_hashes = int(row['intersect_hashes']) - q1_ani = float(row['query_containment_ani']) - q2_ani = float(row['match_containment_ani']) - avg_ani = float(row['average_containment_ani']) - max_ani = float(row['max_containment_ani']) + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + q1_ani = float(row["query_containment_ani"]) + q2_ani = float(row["match_containment_ani"]) + avg_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) jaccard = round(jaccard, 4) cont = round(cont, 4) @@ -532,9 +596,20 @@ def test_simple_hp_ani(runtmp): q2_ani = round(q2_ani, 4) avg_ani = round(avg_ani, 4) max_ani = round(max_ani, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}", intersect_hashes, f"{q1_ani:.04}", f"{q2_ani:.04}", f"{avg_ani:.04}", f"{max_ani:.04}") - - if q == 'GCA_001593925' and m == 'GCA_001593935': + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + intersect_hashes, + f"{q1_ani:.04}", + f"{q2_ani:.04}", + f"{avg_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "GCA_001593925" and m == "GCA_001593935": assert jaccard == 0.4983 assert cont == 0.747 assert maxcont == 0.747 @@ -547,22 +622,23 @@ def test_simple_hp_ani(runtmp): def test_simple_below_threshold(runtmp): # test basic execution! - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') + output = runtmp.output("out.csv") - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output, '--ani', '--threshold', '0.5') + runtmp.sourmash( + "scripts", "pairwise", query_list, "-o", output, "--ani", "--threshold", "0.5" + ) assert os.path.exists(output) - with open(output, 'r') as csvfile: + with open(output, "r") as csvfile: reader = csv.reader(csvfile) rows = list(reader) print(rows) @@ -571,69 +647,84 @@ def test_simple_below_threshold(runtmp): def test_simple_below_threshold_write_all(runtmp): # test basic execution! - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output, '--ani', '--threshold', '0.5', - '--write-all') + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "pairwise", + query_list, + "-o", + output, + "--ani", + "--threshold", + "0.5", + "--write-all", + ) assert os.path.exists(output) - with open(output, 'r') as csvfile: + with open(output, "r") as csvfile: reader = csv.DictReader(csvfile) rows = list(reader) print(rows) assert len(rows) == 3 for row in rows: - assert float(row['query_containment_ani']) == 1.0 - assert float(row['match_containment_ani']) == 1.0 - assert float(row['average_containment_ani']) == 1.0 - assert float(row['max_containment_ani']) == 1.0 - assert float(row['containment']) == 1.0 - assert float(row['max_containment']) == 1.0 - assert float(row['jaccard']) == 1.0 - assert row['query_name'] == row['match_name'] - assert row['query_md5'] == row['match_md5'] + assert float(row["query_containment_ani"]) == 1.0 + assert float(row["match_containment_ani"]) == 1.0 + assert float(row["average_containment_ani"]) == 1.0 + assert float(row["max_containment_ani"]) == 1.0 + assert float(row["containment"]) == 1.0 + assert float(row["max_containment"]) == 1.0 + assert float(row["jaccard"]) == 1.0 + assert row["query_name"] == row["match_name"] + assert row["query_md5"] == row["match_md5"] def test_simple_below_threshold_write_all_no_ani(runtmp): # test basic execution! - query_list = runtmp.output('query.txt') - against_list = runtmp.output('against.txt') + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_file_list(query_list, [sig2, sig47, sig63]) - output = runtmp.output('out.csv') - - runtmp.sourmash('scripts', 'pairwise', query_list, - '-o', output, '--threshold', '0.5', - '--write-all') + output = runtmp.output("out.csv") + + runtmp.sourmash( + "scripts", + "pairwise", + query_list, + "-o", + output, + "--threshold", + "0.5", + "--write-all", + ) assert os.path.exists(output) - with open(output, 'r') as csvfile: + with open(output, "r") as csvfile: reader = csv.DictReader(csvfile) rows = list(reader) print(rows) assert len(rows) == 3 for row in rows: - assert 'query_containment_ani' not in row.keys() - assert 'match_containment_ani' not in row.keys() - assert 'average_containment_ani' not in row.keys() - assert 'max_containment_ani' not in row.keys() - assert float(row['containment']) == 1.0 - assert float(row['max_containment']) == 1.0 - assert float(row['jaccard']) == 1.0 - assert row['query_name'] == row['match_name'] - assert row['query_md5'] == row['match_md5'] + assert "query_containment_ani" not in row.keys() + assert "match_containment_ani" not in row.keys() + assert "average_containment_ani" not in row.keys() + assert "max_containment_ani" not in row.keys() + assert float(row["containment"]) == 1.0 + assert float(row["max_containment"]) == 1.0 + assert float(row["jaccard"]) == 1.0 + assert row["query_name"] == row["match_name"] + assert row["query_md5"] == row["match_md5"] diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index 98f08058..3c610a56 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -9,55 +9,65 @@ def get_test_data(filename): thisdir = os.path.dirname(__file__) - return os.path.join(thisdir, 'test-data', filename) + return os.path.join(thisdir, "test-data", filename) -def make_assembly_csv(filename, genome_paths, protein_paths = []): +def make_assembly_csv(filename, genome_paths, protein_paths=[]): # equalize path lengths by adding "". - names = [os.path.basename(x).split('.fa')[0] for x in genome_paths] + names = [os.path.basename(x).split(".fa")[0] for x in genome_paths] if len(protein_paths) < len(genome_paths): - protein_paths.extend(["" for _ in range(len(genome_paths) - len(protein_paths))]) + protein_paths.extend( + ["" for _ in range(len(genome_paths) - len(protein_paths))] + ) elif len(genome_paths) < len(protein_paths): genome_paths.extend(["" for _ in range(len(protein_paths) - len(genome_paths))]) - names = [os.path.basename(x).split('.fa')[0] for x in protein_paths] + names = [os.path.basename(x).split(".fa")[0] for x in protein_paths] - with open(filename, 'wt') as fp: + with open(filename, "wt") as fp: fp.write("name,genome_filename,protein_filename\n") for name, genome_path, protein_path in zip(names, genome_paths, protein_paths): fp.write("{},{},{}\n".format(name, genome_path, protein_path)) -def make_reads_csv(filename, reads_tuples = []): + +def make_reads_csv(filename, reads_tuples=[]): # reads tuples should be (name,read1,read2) - with open(filename, 'wt') as fp: + with open(filename, "wt") as fp: fp.write("name,read1,read2\n") - for (name, read1, read2) in reads_tuples: + for name, read1, read2 in reads_tuples: print(f"{name},{read1},{read2}") fp.write("{},{},{}\n".format(name, read1, read2)) def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch') + runtmp.sourmash("scripts", "manysketch") - assert 'usage: manysketch' in runtmp.last_result.err + assert "usage: manysketch" in runtmp.last_result.err def test_manysketch_simple(runtmp): - fa_csv = runtmp.output('db-fa.txt') + fa_csv = runtmp.output("db-fa.txt") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") make_assembly_csv(fa_csv, [fa1, fa2, fa3]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=31,scaled=1") + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=31,scaled=1", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) @@ -67,21 +77,28 @@ def test_manysketch_simple(runtmp): def test_manysketch_mult_k(runtmp): - fa_csv = runtmp.output('db-fa.txt') + fa_csv = runtmp.output("db-fa.txt") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") make_assembly_csv(fa_csv, [fa1, fa2, fa3]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=21,k=31,scaled=1") + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=21,k=31,scaled=1", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) @@ -91,29 +108,38 @@ def test_manysketch_mult_k(runtmp): names = [sig.name for sig in sigs] print(names) - assert names.count('short') == 2 - assert names.count('short2') == 2 - assert names.count('short3') == 2 + assert names.count("short") == 2 + assert names.count("short2") == 2 + assert names.count("short3") == 2 def test_manysketch_mult_k_2(runtmp): - fa_csv = runtmp.output('db-fa.txt') + fa_csv = runtmp.output("db-fa.txt") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") make_assembly_csv(fa_csv, [fa1, fa2, fa3]) - output = runtmp.output('db.zip') - - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=21,scaled=1", - '--param-str', "dna,k=31,scaled=1", - '--param-str', "dna,k=21,scaled=1") + output = runtmp.output("db.zip") + + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=21,scaled=1", + "--param-str", + "dna,k=31,scaled=1", + "--param-str", + "dna,k=21,scaled=1", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) @@ -123,29 +149,37 @@ def test_manysketch_mult_k_2(runtmp): names = [sig.name for sig in sigs] print(names) - assert names.count('short') == 2 - assert names.count('short2') == 2 - assert names.count('short3') == 2 + assert names.count("short") == 2 + assert names.count("short2") == 2 + assert names.count("short3") == 2 def test_manysketch_mult_moltype(runtmp): - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') - protfa1 = get_test_data('short-protein.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") + protfa1 = get_test_data("short-protein.fa") make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=21,scaled=1", - '--param-str', "protein,k=10,scaled=1") + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=21,scaled=1", + "--param-str", + "protein,k=10,scaled=1", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) @@ -154,39 +188,50 @@ def test_manysketch_mult_moltype(runtmp): assert len(sigs) == 4 # check moltypes, etc! for sig in sigs: - if sig.name == 'short': + if sig.name == "short": if sig.minhash.is_dna: assert sig.minhash.ksize == 21 assert sig.minhash.scaled == 1 assert sig.md5sum() == "1474578c5c46dd09da4c2df29cf86621" else: - assert sig.name == 'short' + assert sig.name == "short" assert sig.minhash.ksize == 10 assert sig.minhash.scaled == 1 assert sig.md5sum() == "eb4467d11e0ecd2dbde4193bfc255310" else: - assert sig.name in ['short', 'short2', 'short3'] + assert sig.name in ["short", "short2", "short3"] assert sig.minhash.ksize == 21 assert sig.minhash.scaled == 1 assert sig.minhash.is_dna - assert sig.md5sum() in ["4efeebd26644278e36b9553e018a851a","f85747ac4f473c4a71c1740d009f512b"] + assert sig.md5sum() in [ + "4efeebd26644278e36b9553e018a851a", + "f85747ac4f473c4a71c1740d009f512b", + ] def test_manysketch_mult_moltype_protein(runtmp): - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - protfa1 = get_test_data('short-protein.fa') + protfa1 = get_test_data("short-protein.fa") make_assembly_csv(fa_csv, [], [protfa1]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dayhoff,k=10,scaled=1", - '--param-str', "hp,k=24,scaled=1") + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dayhoff,k=10,scaled=1", + "--param-str", + "hp,k=24,scaled=1", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) @@ -200,57 +245,74 @@ def test_manysketch_mult_moltype_protein(runtmp): assert sig.name == "short-protein" if sig.minhash.dayhoff: assert sig.md5sum() == "320464775fe704d9f938a8c63d8dd722" - total_checked+=1 + total_checked += 1 elif sig.minhash.hp: assert sig.md5sum() == "e8ccc6ca7ad560072f51be631d1c39c0" - total_checked+=1 + total_checked += 1 assert total_checked == 2 def test_manysketch_only_incompatible_fastas(runtmp, capfd): # provide dna, protein fastas, but only sketch protein (skip protein fastas!) - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") make_assembly_csv(fa_csv, [fa1, fa2, fa3]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "protein,k=10,scaled=1") - - assert os.path.exists(output) # output will still exist - is this desired? - assert not runtmp.last_result.out # stdout should be empty + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "protein,k=10,scaled=1", + ) + + assert os.path.exists(output) # output will still exist - is this desired? + assert not runtmp.last_result.out # stdout should be empty captured = capfd.readouterr() print(captured.err) - assert 'DONE. Processed 3 fasta files' in captured.err - assert 'Error: No fasta files compatible with provided sketch parameters: no signatures created.' in captured.err + assert "DONE. Processed 3 fasta files" in captured.err + assert ( + "Error: No fasta files compatible with provided sketch parameters: no signatures created." + in captured.err + ) def test_manysketch_skip_incompatible_fastas(runtmp, capfd): # provide dna, protein fastas, but only sketch protein (skip protein fastas!) - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') - protfa1 = get_test_data('short-protein.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") + protfa1 = get_test_data("short-protein.fa") make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "protein,k=10,scaled=1") + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "protein,k=10,scaled=1", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) @@ -265,19 +327,18 @@ def test_manysketch_skip_incompatible_fastas(runtmp, capfd): assert sig.minhash.ksize == 10 assert sig.minhash.scaled == 1 assert sig.md5sum() == "eb4467d11e0ecd2dbde4193bfc255310" - assert 'DONE. Processed 4 fasta files' in captured.err - assert 'WARNING: 3 fasta files skipped - no compatible signatures.' in captured.err + assert "DONE. Processed 4 fasta files" in captured.err + assert "WARNING: 3 fasta files skipped - no compatible signatures." in captured.err def test_manysketch_missing_fa_csv(runtmp, capfd): # test missing fa_csv file - fa_csv = runtmp.output('fa_csv.txt') - output = runtmp.output('out.zip') + fa_csv = runtmp.output("fa_csv.txt") + output = runtmp.output("out.zip") # make_file_list(fa_csv, []) # don't make fa_csv file with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch', fa_csv, - '-o', output) + runtmp.sourmash("scripts", "manysketch", fa_csv, "-o", output) captured = capfd.readouterr() print(captured.err) @@ -286,18 +347,18 @@ def test_manysketch_missing_fa_csv(runtmp, capfd): def test_manysketch_bad_fa_csv(runtmp, capfd): # siglist instead of fastalist - siglist = runtmp.output('db-sigs.txt') + siglist = runtmp.output("db-sigs.txt") - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") make_assembly_csv(siglist, [sig2, sig47, sig63]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch', siglist, '-o', output) + runtmp.sourmash("scripts", "manysketch", siglist, "-o", output) captured = capfd.readouterr() print(captured.err) @@ -306,15 +367,15 @@ def test_manysketch_bad_fa_csv(runtmp, capfd): def test_manysketch_bad_fa_csv_2(runtmp, capfd): # bad file within filelist - siglist = runtmp.output('bad.txt') + siglist = runtmp.output("bad.txt") # fa_file = runtmp.output("bad.fa") make_assembly_csv(siglist, ["bad2.fa"]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch', siglist, '-o', output) + runtmp.sourmash("scripts", "manysketch", siglist, "-o", output) captured = capfd.readouterr() print(captured.err) @@ -324,13 +385,12 @@ def test_manysketch_bad_fa_csv_2(runtmp, capfd): def test_manysketch_bad_fa_csv_3(runtmp, capfd): # test sketch with fasta provided instead of fa_csv - output = runtmp.output('out.zip') - fa1 = get_test_data('short.fa') + output = runtmp.output("out.zip") + fa1 = get_test_data("short.fa") print(fa1) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch', fa1, - '-o', output) + runtmp.sourmash("scripts", "manysketch", fa1, "-o", output) captured = capfd.readouterr() print(captured.err) @@ -340,89 +400,98 @@ def test_manysketch_bad_fa_csv_3(runtmp, capfd): def test_manysketch_bad_fa_csv_4(runtmp, capfd): # test sketch with improperly formatted fa_csv - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') - protfa1 = get_test_data('short-protein.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") + protfa1 = get_test_data("short-protein.fa") # make file csv but don't fill empty protein rows with ,"" make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) g_fa = [fa1, fa2, fa3] p_fa = [protfa1] - with open(fa_csv, 'wt') as fp: + with open(fa_csv, "wt") as fp: fp.write("name,genome_filename,protein_filename\n") for i, g in enumerate(g_fa): - name = os.path.basename(g).split('.fa')[0] + name = os.path.basename(g).split(".fa")[0] if i < len(p_fa): p = p_fa[i] fp.write("{},{},{}\n".format(name, g, p)) else: - fp.write("{},{}\n".format(name, g)) # missing prot path, no trailing comma + fp.write( + "{},{}\n".format(name, g) + ) # missing prot path, no trailing comma - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch', fa_csv, - '-o', output) + runtmp.sourmash("scripts", "manysketch", fa_csv, "-o", output) captured = capfd.readouterr() print(captured.err) - assert 'found record with 2 fields' in captured.err + assert "found record with 2 fields" in captured.err assert "Could not load fromfile csv" in captured.err def test_manysketch_bad_param_str_moltype(runtmp, capfd): # no moltype provided in param str - fa_csv = runtmp.output('db-fa.txt') + fa_csv = runtmp.output("db-fa.txt") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") make_assembly_csv(fa_csv, [fa1, fa2, fa3]) - output = runtmp.output('out.zip') + output = runtmp.output("out.zip") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch', fa_csv, - '-o', output, '-p', 'k=31,scaled=100') + runtmp.sourmash( + "scripts", "manysketch", fa_csv, "-o", output, "-p", "k=31,scaled=100" + ) captured = capfd.readouterr() print(captured.err) - assert "Error parsing params string: No moltype provided in params string k=31,scaled=100" in captured.err + assert ( + "Error parsing params string: No moltype provided in params string k=31,scaled=100" + in captured.err + ) assert "Failed to parse params string" in captured.err def test_manysketch_bad_param_str_ksize(runtmp, capfd): # no ksize provided in param str - fa_csv = runtmp.output('db-fa.txt') + fa_csv = runtmp.output("db-fa.txt") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") make_assembly_csv(fa_csv, [fa1, fa2, fa3]) - output = runtmp.output('out.zip') + output = runtmp.output("out.zip") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch', fa_csv, - '-o', output, '-p', 'dna,scaled=100') + runtmp.sourmash( + "scripts", "manysketch", fa_csv, "-o", output, "-p", "dna,scaled=100" + ) captured = capfd.readouterr() print(captured.err) - assert "Error parsing params string: No ksizes provided in params string dna,scaled=100" in captured.err + assert ( + "Error parsing params string: No ksizes provided in params string dna,scaled=100" + in captured.err + ) assert "Failed to parse params string" in captured.err + def test_manysketch_empty_fa_csv(runtmp, capfd): # test empty fa_csv file - fa_csv = runtmp.output('fa.txt') - output = runtmp.output('out.zip') - make_assembly_csv(fa_csv, []) # empty + fa_csv = runtmp.output("fa.txt") + output = runtmp.output("out.zip") + make_assembly_csv(fa_csv, []) # empty with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch', fa_csv, - '-o', output) + runtmp.sourmash("scripts", "manysketch", fa_csv, "-o", output) captured = capfd.readouterr() print(captured.err) @@ -430,23 +499,31 @@ def test_manysketch_empty_fa_csv(runtmp, capfd): def test_manysketch_duplicated_rows(runtmp, capfd): - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') - protfa1 = get_test_data('short-protein.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") + protfa1 = get_test_data("short-protein.fa") make_assembly_csv(fa_csv, [fa1, fa1, fa1, fa3]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=21,scaled=1", - '--param-str', "protein,k=10,scaled=1") + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=21,scaled=1", + "--param-str", + "protein,k=10,scaled=1", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) @@ -460,21 +537,22 @@ def test_manysketch_duplicated_rows(runtmp, capfd): def test_manysketch_N_in_dna(runtmp): # make sure we can handle Ns in DNA sequences - fa_csv = runtmp.output('db-fa.txt') - fa1 = runtmp.output('bad.fa') - with open (fa1, 'wt') as fp: + fa_csv = runtmp.output("db-fa.txt") + fa1 = runtmp.output("bad.fa") + with open(fa1, "wt") as fp: fp.write(">bad\n") fp.write("ACAGTN\n") make_assembly_csv(fa_csv, [fa1]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=4,scaled=1") + runtmp.sourmash( + "scripts", "manysketch", fa_csv, "-o", output, "--param-str", "dna,k=4,scaled=1" + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) @@ -485,23 +563,30 @@ def test_manysketch_N_in_dna(runtmp): def test_zip_manifest(runtmp, capfd): # test basic manifest-generating functionality. - fa_csv = runtmp.output('db-fa.txt') + fa_csv = runtmp.output("db-fa.txt") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") make_assembly_csv(fa_csv, [fa1, fa2, fa3]) - output = runtmp.output('db.zip') - - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=31,scaled=1") + output = runtmp.output("db.zip") + + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=31,scaled=1", + ) loader = sourmash.load_file_as_index(output) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -511,37 +596,44 @@ def test_zip_manifest(runtmp, capfd): assert len(manifest) == len(rows) assert len(manifest) == 3 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '9191284a3a23a913d8d410f3d53ce8f0' in md5_list - assert 'd663bb55b2a0f8782c53c8af89f20fff' in md5_list - assert 'bf752903d635b1eb83c53fe4aae951db' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "9191284a3a23a913d8d410f3d53ce8f0" in md5_list + assert "d663bb55b2a0f8782c53c8af89f20fff" in md5_list + assert "bf752903d635b1eb83c53fe4aae951db" in md5_list for sig in siglist: assert sig in manifest assert sig.minhash.ksize == 31 - assert sig.minhash.moltype == 'DNA' + assert sig.minhash.moltype == "DNA" assert sig.minhash.scaled == 1 def test_protein_zip_manifest(runtmp, capfd): # test basic manifest-generating functionality. - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short-protein.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short-protein.fa") make_assembly_csv(fa_csv, [fa1], [fa2]) - output = runtmp.output('db.zip') - - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "protein,k=10,scaled=1") + output = runtmp.output("db.zip") + + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "protein,k=10,scaled=1", + ) loader = sourmash.load_file_as_index(output) rows = [] siglist = [] # make manifest via sourmash python code - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -551,76 +643,103 @@ def test_protein_zip_manifest(runtmp, capfd): assert len(manifest) == len(rows) assert len(manifest) == 1 - md5_list = [ row['md5'] for row in manifest.rows ] - assert 'eb4467d11e0ecd2dbde4193bfc255310' in md5_list - ksize_list = [ row['ksize'] for row in manifest.rows ] - assert 10 in ksize_list # manifest ksizes are human-readable (k, not k*3) - scaled_list = [ row['scaled'] for row in manifest.rows ] + md5_list = [row["md5"] for row in manifest.rows] + assert "eb4467d11e0ecd2dbde4193bfc255310" in md5_list + ksize_list = [row["ksize"] for row in manifest.rows] + assert 10 in ksize_list # manifest ksizes are human-readable (k, not k*3) + scaled_list = [row["scaled"] for row in manifest.rows] assert 1 in scaled_list - moltype_list = [ row['moltype'] for row in manifest.rows ] + moltype_list = [row["moltype"] for row in manifest.rows] assert "protein" in moltype_list for sig in siglist: assert sig in manifest - assert sig.minhash.ksize == 10 # minhash stores k*3, but does the conversion back for us - assert sig.minhash.moltype == 'protein' + assert ( + sig.minhash.ksize == 10 + ) # minhash stores k*3, but does the conversion back for us + assert sig.minhash.moltype == "protein" assert sig.minhash.scaled == 1 def test_manysketch_singleton(runtmp): - fa_csv = runtmp.output('db-fa.txt') + fa_csv = runtmp.output("db-fa.txt") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") make_assembly_csv(fa_csv, [fa1, fa2, fa3]) - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=31,scaled=1", "--singleton") + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=31,scaled=1", + "--singleton", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) print(sigs) assert len(sigs) == 4 - singleton_sketch = runtmp.output('short3.sig') - runtmp.sourmash('sketch', 'dna', fa3, '-o', singleton_sketch, - '--param-str', "dna,k=31,scaled=1", "--singleton") + singleton_sketch = runtmp.output("short3.sig") + runtmp.sourmash( + "sketch", + "dna", + fa3, + "-o", + singleton_sketch, + "--param-str", + "dna,k=31,scaled=1", + "--singleton", + ) ss_sketch = sourmash.load_signatures(singleton_sketch) ss_sketch1 = next(ss_sketch) ss_sketch2 = next(ss_sketch) - expected_signames = ['shortName', 'tr1 4', 'firstname', 'other'] + expected_signames = ["shortName", "tr1 4", "firstname", "other"] for sig in sigs: assert sig.name in expected_signames - if sig.name == 'firstname': + if sig.name == "firstname": assert sig == ss_sketch1 - if sig.name == 'other': + if sig.name == "other": assert sig == ss_sketch2 def test_manysketch_reads(runtmp, capfd): - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") - make_reads_csv(fa_csv, [("short", fa1, fa2), ('short3', fa3, '')]) # make sure we can just do read1 alone + make_reads_csv( + fa_csv, [("short", fa1, fa2), ("short3", fa3, "")] + ) # make sure we can just do read1 alone - output = runtmp.output('db.zip') + output = runtmp.output("db.zip") - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=31,scaled=1") + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=31,scaled=1", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty captured = capfd.readouterr() print(captured.out) print(captured.err) @@ -633,40 +752,72 @@ def test_manysketch_reads(runtmp, capfd): print(sigs) assert len(sigs) == 2 - s1 = runtmp.output('short.sig') - runtmp.sourmash('sketch', 'dna', fa1, fa2, '-o', s1, - '--param-str', "k=31,scaled=1", '--name', 'short') + s1 = runtmp.output("short.sig") + runtmp.sourmash( + "sketch", + "dna", + fa1, + fa2, + "-o", + s1, + "--param-str", + "k=31,scaled=1", + "--name", + "short", + ) sig1 = sourmash.load_one_signature(s1) - s3 = runtmp.output('short3.sig') - runtmp.sourmash('sketch', 'dna', fa3, '-o', s3, - '--param-str', "k=31,scaled=1", '--name', 'short3') + s3 = runtmp.output("short3.sig") + runtmp.sourmash( + "sketch", + "dna", + fa3, + "-o", + s3, + "--param-str", + "k=31,scaled=1", + "--name", + "short3", + ) sig2 = sourmash.load_one_signature(s3) - expected_signames = ['short', 'short3'] + expected_signames = ["short", "short3"] for sig in sigs: assert sig.name in expected_signames - if sig.name == 'short': + if sig.name == "short": assert sig == sig1 - if sig.name == 'short3': + if sig.name == "short3": assert sig == sig2 def test_manysketch_reads_singleton(runtmp, capfd): - fa_csv = runtmp.output('db-fa.csv') - - fa1 = get_test_data('short.fa') - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') - - make_reads_csv(fa_csv, [("short", fa2, fa3), ]) - - output = runtmp.output('db.zip') - - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=31,scaled=1", '--singleton') + fa_csv = runtmp.output("db-fa.csv") + + fa1 = get_test_data("short.fa") + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") + + make_reads_csv( + fa_csv, + [ + ("short", fa2, fa3), + ], + ) + + output = runtmp.output("db.zip") + + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=31,scaled=1", + "--singleton", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty captured = capfd.readouterr() print(captured.out) print(captured.err) @@ -679,52 +830,77 @@ def test_manysketch_reads_singleton(runtmp, capfd): print(sigs) assert len(sigs) == 3 - s1 = runtmp.output('singleton.sig') - runtmp.sourmash('sketch', 'dna', fa2, fa3, '-o', s1, - '--param-str', "k=31,scaled=1", '--singleton') + s1 = runtmp.output("singleton.sig") + runtmp.sourmash( + "sketch", + "dna", + fa2, + fa3, + "-o", + s1, + "--param-str", + "k=31,scaled=1", + "--singleton", + ) ss = sourmash.load_signatures(s1) ss_sketch1 = next(ss) ss_sketch2 = next(ss) ss_sketch3 = next(ss) - expected_signames = ['tr1 4', 'firstname', 'other'] + expected_signames = ["tr1 4", "firstname", "other"] for sig in sigs: assert sig.name in expected_signames - if sig.name == 'tr1 4': + if sig.name == "tr1 4": assert sig == ss_sketch1 - elif sig.name == 'firstname': + elif sig.name == "firstname": assert sig == ss_sketch2 - elif sig.name == 'other': + elif sig.name == "other": assert sig == ss_sketch3 def test_manysketch_prefix(runtmp, capfd): - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') + fa1 = get_test_data("short.fa") fa_path = os.path.dirname(fa1) - dna_prefix = os.path.join(fa_path, "short*fa") # need to avoid matching short-protein.fa + dna_prefix = os.path.join( + fa_path, "short*fa" + ) # need to avoid matching short-protein.fa prot_prefix = os.path.join(fa_path, "*protein.fa") # make prefix input file - with open(fa_csv, 'wt') as fp: + with open(fa_csv, "wt") as fp: fp.write("name,input_moltype,prefix,exclude\n") - fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n") # short.fa, short2.fa, short3.fa, short-protein.fa - fp.write(f"short_protein,protein,{prot_prefix},\n") # short-protein.fa only - - output = runtmp.output('prefix.zip') - - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=31,scaled=1", '-p', "protein,k=10,scaled=1") + fp.write( + f"short,DNA,{dna_prefix},{prot_prefix}\n" + ) # short.fa, short2.fa, short3.fa, short-protein.fa + fp.write(f"short_protein,protein,{prot_prefix},\n") # short-protein.fa only + + output = runtmp.output("prefix.zip") + + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=31,scaled=1", + "-p", + "protein,k=10,scaled=1", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty captured = capfd.readouterr() print(captured.out) print(captured.err) - assert "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." in captured.out + assert ( + "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." + in captured.out + ) assert "DONE. Processed 4 fasta files" in captured.err idx = sourmash.load_file_as_index(output) @@ -734,55 +910,93 @@ def test_manysketch_prefix(runtmp, capfd): assert len(sigs) == 2 # make same sigs with sourmash - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') - fa4 = get_test_data('short-protein.fa') - s1 = runtmp.output('short.sig') - runtmp.sourmash('sketch', 'dna', fa1, fa2, fa3, '-o', s1, - '--param-str', "dna,k=31,scaled=1", '--name', 'short') + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") + fa4 = get_test_data("short-protein.fa") + s1 = runtmp.output("short.sig") + runtmp.sourmash( + "sketch", + "dna", + fa1, + fa2, + fa3, + "-o", + s1, + "--param-str", + "dna,k=31,scaled=1", + "--name", + "short", + ) sig1 = sourmash.load_one_signature(s1) - s2 = runtmp.output('short-protein.sig') - runtmp.sourmash('sketch', 'protein', fa4, '-o', s2, - '--param-str', "protein,k=10,scaled=1", '--name', 'short_protein') + s2 = runtmp.output("short-protein.sig") + runtmp.sourmash( + "sketch", + "protein", + fa4, + "-o", + s2, + "--param-str", + "protein,k=10,scaled=1", + "--name", + "short_protein", + ) sig2 = sourmash.load_one_signature(s2) - expected_signames = ['short', 'short_protein'] + expected_signames = ["short", "short_protein"] for sig in sigs: assert sig.name in expected_signames - if sig.name == 'short': - assert sig,minhash.hashes == sig1.minhash.hashes - if sig.name == 'short_protein': + if sig.name == "short": + assert sig, minhash.hashes == sig1.minhash.hashes + if sig.name == "short_protein": assert sig == sig2 def test_manysketch_prefix2(runtmp, capfd): - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') + fa1 = get_test_data("short.fa") fa_path = os.path.dirname(fa1) # test without '*' - dna_prefix = os.path.join(fa_path, "short") # need to avoid matching short-protein.fa + dna_prefix = os.path.join( + fa_path, "short" + ) # need to avoid matching short-protein.fa prot_prefix = os.path.join(fa_path, "*protein") zip_exclude = os.path.join(fa_path, "*zip") # make prefix input file - with open(fa_csv, 'wt') as fp: + with open(fa_csv, "wt") as fp: fp.write("name,input_moltype,prefix,exclude\n") - fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n") # short.fa, short2.fa, short3.fa, short-protein.fa - fp.write(f"short_protein,protein,{prot_prefix},{zip_exclude}\n") # short-protein.fa only - - output = runtmp.output('prefix.zip') - - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=31,scaled=1", '-p', "protein,k=10,scaled=1") + fp.write( + f"short,DNA,{dna_prefix},{prot_prefix}\n" + ) # short.fa, short2.fa, short3.fa, short-protein.fa + fp.write( + f"short_protein,protein,{prot_prefix},{zip_exclude}\n" + ) # short-protein.fa only + + output = runtmp.output("prefix.zip") + + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=31,scaled=1", + "-p", + "protein,k=10,scaled=1", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty captured = capfd.readouterr() print(captured.out) print(captured.err) - assert "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." in captured.out + assert ( + "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." + in captured.out + ) assert "DONE. Processed 4 fasta files" in captured.err idx = sourmash.load_file_as_index(output) @@ -792,98 +1006,164 @@ def test_manysketch_prefix2(runtmp, capfd): assert len(sigs) == 2 # make same sigs with sourmash - fa2 = get_test_data('short2.fa') - fa3 = get_test_data('short3.fa') - fa4 = get_test_data('short-protein.fa') - s1 = runtmp.output('short.sig') - runtmp.sourmash('sketch', 'dna', fa1, fa2, fa3, '-o', s1, - '--param-str', "dna,k=31,scaled=1", '--name', 'short') + fa2 = get_test_data("short2.fa") + fa3 = get_test_data("short3.fa") + fa4 = get_test_data("short-protein.fa") + s1 = runtmp.output("short.sig") + runtmp.sourmash( + "sketch", + "dna", + fa1, + fa2, + fa3, + "-o", + s1, + "--param-str", + "dna,k=31,scaled=1", + "--name", + "short", + ) sig1 = sourmash.load_one_signature(s1) - s2 = runtmp.output('short-protein.sig') - runtmp.sourmash('sketch', 'protein', fa4, '-o', s2, - '--param-str', "protein,k=10,scaled=1", '--name', 'short_protein') + s2 = runtmp.output("short-protein.sig") + runtmp.sourmash( + "sketch", + "protein", + fa4, + "-o", + s2, + "--param-str", + "protein,k=10,scaled=1", + "--name", + "short_protein", + ) sig2 = sourmash.load_one_signature(s2) - expected_signames = ['short', 'short_protein'] + expected_signames = ["short", "short_protein"] for sig in sigs: assert sig.name in expected_signames - if sig.name == 'short': - assert sig,minhash.hashes == sig1.minhash.hashes - if sig.name == 'short_protein': + if sig.name == "short": + assert sig, minhash.hashes == sig1.minhash.hashes + if sig.name == "short_protein": assert sig == sig2 def test_manysketch_prefix_duplicated_fail(runtmp, capfd): - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') + fa1 = get_test_data("short.fa") fa_path = os.path.dirname(fa1) # test without '*' - dna_prefix = os.path.join(fa_path, "short") # need to avoid matching short-protein.fa + dna_prefix = os.path.join( + fa_path, "short" + ) # need to avoid matching short-protein.fa prot_prefix = os.path.join(fa_path, "*protein") zip_exclude = os.path.join(fa_path, "*zip") # make prefix input file - with open(fa_csv, 'wt') as fp: + with open(fa_csv, "wt") as fp: fp.write("name,input_moltype,prefix,exclude\n") - fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n") # short.fa, short2.fa, short3.fa, short-protein.fa - fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n") # duplicate of row one -- this should just be skipped - fp.write(f"short_protein,protein,{prot_prefix},{zip_exclude}\n") # short-protein.fa only + fp.write( + f"short,DNA,{dna_prefix},{prot_prefix}\n" + ) # short.fa, short2.fa, short3.fa, short-protein.fa + fp.write( + f"short,DNA,{dna_prefix},{prot_prefix}\n" + ) # duplicate of row one -- this should just be skipped + fp.write( + f"short_protein,protein,{prot_prefix},{zip_exclude}\n" + ) # short-protein.fa only # ALSO short-protein.fa, but different name. should raise err without force fp.write(f"second_protein,protein,{prot_prefix},{zip_exclude}\n") - output = runtmp.output('prefix.zip') + output = runtmp.output("prefix.zip") with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=31,scaled=1", '-p', "protein,k=10,scaled=1") + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=31,scaled=1", + "-p", + "protein,k=10,scaled=1", + ) assert not os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty captured = capfd.readouterr() print(captured.out) print(captured.err) - assert "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." in captured.out + assert ( + "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." + in captured.out + ) assert "Found identical FASTA paths in more than one row!" in captured.err assert "Duplicated paths:" in captured.err assert "short-protein.fa" in captured.err - assert "Duplicated FASTA files found. Please use --force to bypass this check" in captured.err + assert ( + "Duplicated FASTA files found. Please use --force to bypass this check" + in captured.err + ) def test_manysketch_prefix_duplicated_force(runtmp, capfd): - fa_csv = runtmp.output('db-fa.csv') + fa_csv = runtmp.output("db-fa.csv") - fa1 = get_test_data('short.fa') + fa1 = get_test_data("short.fa") fa_path = os.path.dirname(fa1) # test without '*' - dna_prefix = os.path.join(fa_path, "short") # need to avoid matching short-protein.fa + dna_prefix = os.path.join( + fa_path, "short" + ) # need to avoid matching short-protein.fa prot_prefix = os.path.join(fa_path, "*protein") zip_exclude = os.path.join(fa_path, "*zip") # make prefix input file - with open(fa_csv, 'wt') as fp: + with open(fa_csv, "wt") as fp: fp.write("name,input_moltype,prefix,exclude\n") - fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n") # short.fa, short2.fa, short3.fa, short-protein.fa - fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n") # duplicate of row one -- this should just be skipped - fp.write(f"short_protein,protein,{prot_prefix},{zip_exclude}\n") # short-protein.fa only + fp.write( + f"short,DNA,{dna_prefix},{prot_prefix}\n" + ) # short.fa, short2.fa, short3.fa, short-protein.fa + fp.write( + f"short,DNA,{dna_prefix},{prot_prefix}\n" + ) # duplicate of row one -- this should just be skipped + fp.write( + f"short_protein,protein,{prot_prefix},{zip_exclude}\n" + ) # short-protein.fa only # ALSO short-protein.fa, but different name. should raise err without force fp.write(f"second_protein,protein,{prot_prefix},{zip_exclude}\n") - output = runtmp.output('prefix.zip') - - runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, - '--param-str', "dna,k=31,scaled=1", '-p', "protein,k=10,scaled=1", - '--force') + output = runtmp.output("prefix.zip") + + runtmp.sourmash( + "scripts", + "manysketch", + fa_csv, + "-o", + output, + "--param-str", + "dna,k=31,scaled=1", + "-p", + "protein,k=10,scaled=1", + "--force", + ) assert os.path.exists(output) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty captured = capfd.readouterr() print(captured.out) print(captured.err) - assert "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." in captured.out - assert "Loaded 3 rows in total (3 DNA FASTA and 2 protein FASTA), 1 duplicate rows skipped." in captured.out + assert ( + "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." + in captured.out + ) + assert ( + "Loaded 3 rows in total (3 DNA FASTA and 2 protein FASTA), 1 duplicate rows skipped." + in captured.out + ) assert "Found identical FASTA paths in more than one row!" in captured.err assert "Duplicated paths:" in captured.err assert "short-protein.fa" in captured.err @@ -895,19 +1175,20 @@ def test_manysketch_prefix_duplicated_force(runtmp, capfd): assert len(sigs) == 3 + def test_singlesketch_simple(runtmp): """Test basic single sketching with default parameters.""" - fa1 = get_test_data('short.fa') - output = runtmp.output('short.sig') + fa1 = get_test_data("short.fa") + output = runtmp.output("short.sig") # Run the singlesketch command - runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output) + runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output) # Check if the output exists and contains the expected data assert os.path.exists(output) sig = sourmash.load_one_signature(output) - - assert sig.name == 'short.fa' + + assert sig.name == "short.fa" assert sig.minhash.ksize == 31 assert sig.minhash.is_dna assert sig.minhash.scaled == 1000 @@ -915,17 +1196,17 @@ def test_singlesketch_simple(runtmp): def test_singlesketch_with_name(runtmp): """Test single sketching with a custom name.""" - fa1 = get_test_data('short.fa') - output = runtmp.output('short_named.sig') + fa1 = get_test_data("short.fa") + output = runtmp.output("short_named.sig") # Run the singlesketch command with the --name option - runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-n', 'custom_name') + runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output, "-n", "custom_name") # Check if the output exists and contains the expected data assert os.path.exists(output) sig = sourmash.load_one_signature(output) - assert sig.name == 'custom_name' + assert sig.name == "custom_name" assert sig.minhash.ksize == 31 assert sig.minhash.is_dna assert sig.minhash.scaled == 1000 @@ -933,11 +1214,21 @@ def test_singlesketch_with_name(runtmp): def test_singlesketch_mult_k(runtmp): """Test single sketching with multiple k-mer sizes.""" - fa1 = get_test_data('short.fa') - output = runtmp.output('short_mult_k.sig') + fa1 = get_test_data("short.fa") + output = runtmp.output("short_mult_k.sig") # Run the singlesketch command with multiple k sizes - runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-p', 'k=21,scaled=100', '-p', 'k=31,scaled=100') + runtmp.sourmash( + "scripts", + "singlesketch", + fa1, + "-o", + output, + "-p", + "k=21,scaled=100", + "-p", + "k=31,scaled=100", + ) # Check if the output exists and contains the expected data assert os.path.exists(output) @@ -951,11 +1242,13 @@ def test_singlesketch_mult_k(runtmp): def test_singlesketch_mult_moltype(runtmp): """Test single sketching with different molecule types.""" - fa1 = get_test_data('short-protein.fa') - output = runtmp.output('short_mult_moltype.sig') + fa1 = get_test_data("short-protein.fa") + output = runtmp.output("short_mult_moltype.sig") # Run the singlesketch command with multiple molecule types - runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-p', 'protein,k=10,scaled=100') + runtmp.sourmash( + "scripts", "singlesketch", fa1, "-o", output, "-p", "protein,k=10,scaled=100" + ) # Check if the output exists and contains the expected data assert os.path.exists(output) @@ -969,12 +1262,14 @@ def test_singlesketch_mult_moltype(runtmp): def test_singlesketch_invalid_params(runtmp, capfd): """Test singlesketch command with invalid parameters.""" - fa1 = get_test_data('short.fa') - output = runtmp.output('short_invalid.sig') + fa1 = get_test_data("short.fa") + output = runtmp.output("short_invalid.sig") # Run the singlesketch command with an invalid parameter string with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-p', 'invalid_param') + runtmp.sourmash( + "scripts", "singlesketch", fa1, "-o", output, "-p", "invalid_param" + ) # Check that the error message is correct captured = capfd.readouterr()