From 94d82b1ecd5004ebb71723ff3dbb0e6de84b2e59 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Sep 2023 10:45:26 -0700 Subject: [PATCH] MRG: fix exponential time explosion in `sig check` (#2762) Fixes #2646. Embarrassing "let's make things run really slow" typo of the month... fixed! In brief, the following code in `manifest.py` was causing an exponential explosion in time cost: ```python def _add_rows(self, rows): self.rows.extend(rows) # maintain a fast check for md5sums for __contains__ check. md5set = self._md5_set for row in self.rows: md5set.add(row['md5']) ``` when called many times on a single row, because each addition of a row triggered an iteration over ALL rows. This PR changes things so that we do `for row in rows:`. Additional complexity => new tests to deal explicitly with the case where `rows` is a generator (which it turns out it often is, breaking dozens of tests when not accounted for) --- src/sourmash/manifest.py | 10 +++-- src/sourmash/sig/__main__.py | 6 ++- tests/test_manifest.py | 86 +++++++++++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 8 deletions(-) diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index 4bcf1b7dbc..466bfa8e7a 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -232,14 +232,16 @@ def add_row(self, row): self._add_rows([row]) def _add_rows(self, rows): - self.rows.extend(rows) - - # maintain a fast check for md5sums for __contains__ check. md5set = self._md5_set - for row in self.rows: + + # only iterate once, in case it's a generator + for row in rows: + self.rows.append(row) md5set.add(row['md5']) def __iadd__(self, other): + if self is other: + raise Exception("cannot directly add manifest to itself") self._add_rows(other.rows) return self diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 7336f7ac79..48addfe1e4 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -14,7 +14,7 @@ from sourmash.sourmash_args import FileOutput from sourmash.logging import (set_quiet, error, notify, print_results, debug, - debug_literal) + debug_literal, _debug) from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled from sourmash.manifest import CollectionManifest @@ -1359,7 +1359,9 @@ def check(args): row['internal_location'] = filename total_manifest_rows.add_row(row) - debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows") + # the len(sub_manifest) here should only be run when needed :) + if _debug: + debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows") notify(f"loaded {total_rows_examined} signatures.") diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 3f63d4e88f..074d72d705 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -1,10 +1,11 @@ """ Tests for manifest code in databases, etc. """ +import pytest from io import StringIO import sourmash -from sourmash import index +from sourmash import index, sourmash_args import sourmash_tst_utils as utils @@ -49,7 +50,8 @@ def test_manifest_operations(): siglist.append(sig) manifest = index.CollectionManifest(rows) - manifest += manifest + manifest2 = index.CollectionManifest(rows) + manifest += manifest2 assert len(manifest) == 2*len(rows) assert len(manifest) == 4 @@ -59,6 +61,25 @@ def test_manifest_operations(): assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list +def test_manifest_operations_fail(): + # should not be able to add a manifest to itself - not only makes + # no sense, but it means you're modifying a generator in place, sometimes. + protzip = utils.get_test_data('prot/protein.zip') + + loader = sourmash.load_file_as_index(protzip) + + rows = [] + siglist = [] + for (sig, loc) in loader._signatures_with_internal(): + row = index.CollectionManifest.make_manifest_row(sig, loc) + rows.append(row) + siglist.append(sig) + + manifest = index.CollectionManifest(rows) + with pytest.raises(Exception): + manifest += manifest + + def test_manifest_to_picklist(): # test manifest/picklist interaction basics protzip = utils.get_test_data('prot/protein.zip') @@ -162,3 +183,64 @@ def test_save_load_manifest(): short_mf = index.CollectionManifest(rows) assert short_mf != manifest + + +def test_manifest_to_picklist_bug(runtmp): + # this tests a fun combination of things that led to a bug. + # tl;dr we only want to iterate once across a generator... + # ref #2762 + c = runtmp + all_zip = utils.get_test_data('prot/all.zip') + + idx = sourmash_args.load_file_as_index(all_zip) + assert len(idx) == 8 + + manifest = sourmash_args.get_manifest(idx) + assert len(manifest) == 8 + + def filter_fn(row): + # match? + keep = False + if "09a0869" in row['md5']: + keep = True + + return keep + + sub_manifest = manifest.filter_rows(filter_fn) + sub_picklist = sub_manifest.to_picklist() + idx = idx.select(picklist=sub_picklist) + + assert len(idx) == 1 + print(idx) + + x = list(idx.signatures()) + assert len(x) + + +def test_generate_manifest_iterate_once(): + # we should only iterate across manifest rows once + protzip = utils.get_test_data('prot/protein.zip') + + loader = sourmash.load_file_as_index(protzip) + + siglist = [] + for (sig, loc) in loader._signatures_with_internal(): + siglist.append(sig) + + # build generator function => will not allow iteration twice + def genfn(): + for (sig, loc) in loader._signatures_with_internal(): + row = index.CollectionManifest.make_manifest_row(sig, loc) + yield row + + manifest = index.CollectionManifest(genfn()) + + assert len(manifest) == 2 + assert len(manifest._md5_set) == 2 + + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + + for sig in siglist: + assert sig in manifest